prometheus/storage/local/storage_test.go

// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package local

import (
	"fmt"
	"hash/fnv"
	"math"
	"math/rand"
	"os"
	"runtime"
	"strconv"
	"sync/atomic"
	"testing"
	"testing/quick"
	"time"

	"github.com/prometheus/common/log"
	"github.com/prometheus/common/model"
	"golang.org/x/net/context"

	"github.com/prometheus/prometheus/storage/local/chunk"
	"github.com/prometheus/prometheus/storage/metric"
	"github.com/prometheus/prometheus/util/testutil"
)

func TestMatches(t *testing.T) {
	storage, closer := NewTestStorage(t, 2)
	defer closer.Close()

	storage.archiveHighWatermark = 90
	samples := make([]*model.Sample, 100)
	fingerprints := make(model.Fingerprints, 100)

	for i := range samples {
		metric := model.Metric{
			model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
			"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
			"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
			"all":                 "const",
		}
		samples[i] = &model.Sample{
			Metric:    metric,
			Timestamp: model.Time(i),
			Value:     model.SampleValue(i),
		}
		fingerprints[i] = metric.FastFingerprint()
	}
	for _, s := range samples {
		storage.Append(s)
	}
	storage.WaitForIndexing()

	// Archive every tenth metric.
	for i, fp := range fingerprints {
		if i%10 != 0 {
			continue
		}
		s, ok := storage.fpToSeries.get(fp)
		if !ok {
			t.Fatal("could not retrieve series for fp", fp)
		}
		storage.fpLocker.Lock(fp)
		storage.persistence.archiveMetric(fp, s.metric, s.firstTime(), s.lastTime)
		storage.fpLocker.Unlock(fp)
	}

	newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
		lm, err := metric.NewLabelMatcher(matchType, name, value)
		if err != nil {
			t.Fatalf("error creating label matcher: %s", err)
		}
		return lm
	}

	var matcherTests = []struct {
		matchers metric.LabelMatchers
		expected model.Fingerprints
	}{
		{
			matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "x")},
			expected: model.Fingerprints{},
		},
		{
			matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "test_0")},
			expected: fingerprints[:10],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", "test_0"),
				newMatcher(metric.Equal, "label2", "test_1"),
			},
			expected: fingerprints[5:10],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "x"),
			},
			expected: fingerprints,
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
			},
			expected: fingerprints[10:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.NotEqual, "label1", "test_1"),
				newMatcher(metric.NotEqual, "label1", "test_2"),
			},
			expected: fingerprints[30:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "label1", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "label2", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "not_existent", ""),
			},
			expected: fingerprints[10:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
			},
			expected: fingerprints[30:60],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.RegexNoMatch, "label1", `test_[3-5]`),
			},
			expected: append(append(model.Fingerprints{}, fingerprints[:30]...), fingerprints[60:]...),
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
				newMatcher(metric.RegexMatch, "label2", `test_[4-6]`),
			},
			expected: fingerprints[35:60],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
				newMatcher(metric.NotEqual, "label2", `test_4`),
			},
			expected: append(append(model.Fingerprints{}, fingerprints[30:35]...), fingerprints[45:60]...),
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", `nonexistent`),
				newMatcher(metric.RegexMatch, "label2", `test`),
			},
			expected: model.Fingerprints{},
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", `test_0`),
				newMatcher(metric.RegexMatch, "label2", `nonexistent`),
			},
			expected: model.Fingerprints{},
		},
	}

	for _, mt := range matcherTests {
		metrics, err := storage.MetricsForLabelMatchers(
			context.Background(),
			model.Earliest, model.Latest,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(mt.expected) != len(metrics) {
			t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.matchers, len(metrics))
		}
		for _, m := range metrics {
			fp1 := m.Metric.FastFingerprint()
			found := false
			for _, fp2 := range mt.expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.matchers)
			}
		}
		// Smoketest for from/through.
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			model.Earliest, -10000,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(metrics) > 0 {
			t.Error("expected no matches with 'through' older than any sample")
		}
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			10000, model.Latest,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(metrics) > 0 {
			t.Error("expected no matches with 'from' newer than any sample")
		}
		// Now the tricky one, cut out something from the middle.
		var (
			from    model.Time = 25
			through model.Time = 75
		)
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			from, through,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		expected := model.Fingerprints{}
		for _, fp := range mt.expected {
			i := 0
			for ; fingerprints[i] != fp && i < len(fingerprints); i++ {
			}
			if i == len(fingerprints) {
				t.Fatal("expected fingerprint does not exist")
			}
			if !model.Time(i).Before(from) && !model.Time(i).After(through) {
				expected = append(expected, fp)
			}
		}
		if len(expected) != len(metrics) {
			t.Errorf("expected %d range-limited matches for %q, found %d", len(expected), mt.matchers, len(metrics))
		}
		for _, m := range metrics {
			fp1 := m.Metric.FastFingerprint()
			found := false
			for _, fp2 := range expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in range-limited result", fp1, mt.matchers)
			}
		}

	}
}

func TestFingerprintsForLabels(t *testing.T) {
	storage, closer := NewTestStorage(t, 2)
	defer closer.Close()

	samples := make([]*model.Sample, 100)
	fingerprints := make(model.Fingerprints, 100)

	for i := range samples {
		metric := model.Metric{
			model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
			"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
			"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
		}
		samples[i] = &model.Sample{
			Metric:    metric,
			Timestamp: model.Time(i),
			Value:     model.SampleValue(i),
		}
		fingerprints[i] = metric.FastFingerprint()
	}
	for _, s := range samples {
		storage.Append(s)
	}
	storage.WaitForIndexing()

	var matcherTests = []struct {
		pairs    []model.LabelPair
		expected model.Fingerprints
	}{
		{
			pairs:    []model.LabelPair{{Name: "label1", Value: "x"}},
			expected: fingerprints[:0],
		},
		{
			pairs:    []model.LabelPair{{Name: "label1", Value: "test_0"}},
			expected: fingerprints[:10],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_0"},
				{Name: "label1", Value: "test_1"},
			},
			expected: fingerprints[:0],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_0"},
				{Name: "label2", Value: "test_1"},
			},
			expected: fingerprints[5:10],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_1"},
				{Name: "label2", Value: "test_2"},
			},
			expected: fingerprints[15:20],
		},
	}

	for _, mt := range matcherTests {
		var resfps map[model.Fingerprint]struct{}
		for _, pair := range mt.pairs {
			resfps = storage.fingerprintsForLabelPair(pair, nil, resfps)
		}
		if len(mt.expected) != len(resfps) {
			t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.pairs, len(resfps))
		}
		for fp1 := range resfps {
			found := false
			for _, fp2 := range mt.expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.pairs)
			}
		}
	}
}

var benchLabelMatchingRes []metric.Metric

func BenchmarkLabelMatching(b *testing.B) {
	s, closer := NewTestStorage(b, 2)
	defer closer.Close()

	h := fnv.New64a()
	lbl := func(x int) model.LabelValue {
		h.Reset()
		h.Write([]byte(fmt.Sprintf("%d", x)))
		return model.LabelValue(fmt.Sprintf("%d", h.Sum64()))
	}

	M := 32
	met := model.Metric{}
	for i := 0; i < M; i++ {
		met["label_a"] = lbl(i)
		for j := 0; j < M; j++ {
			met["label_b"] = lbl(j)
			for k := 0; k < M; k++ {
				met["label_c"] = lbl(k)
				for l := 0; l < M; l++ {
					met["label_d"] = lbl(l)
					s.Append(&model.Sample{
						Metric:    met.Clone(),
						Timestamp: 0,
						Value:     1,
					})
				}
			}
		}
	}
	s.WaitForIndexing()

	newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
		lm, err := metric.NewLabelMatcher(matchType, name, value)
		if err != nil {
			b.Fatalf("error creating label matcher: %s", err)
		}
		return lm
	}

	var matcherTests = []metric.LabelMatchers{
		{
			newMatcher(metric.Equal, "label_a", lbl(1)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
			newMatcher(metric.NotEqual, "label_d", lbl(3)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_b", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
			newMatcher(metric.NotEqual, "label_d", lbl(3)),
		},
		{
			newMatcher(metric.RegexMatch, "label_a", ".+"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.RegexMatch, "label_a", ".+"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(1)),
			newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_a", lbl(4)),
			newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
		},
	}

	b.ReportAllocs()
	b.ResetTimer()

	var err error
	for i := 0; i < b.N; i++ {
		benchLabelMatchingRes = []metric.Metric{}
		for _, mt := range matcherTests {
			benchLabelMatchingRes, err = s.MetricsForLabelMatchers(
				context.Background(),
				model.Earliest, model.Latest,
				mt,
			)
			if err != nil {
				b.Fatal(err)
			}
		}
	}
	// Stop timer to not count the storage closing.
	b.StopTimer()
}

func BenchmarkQueryRange(b *testing.B) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(b, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	for i := 0; i < 8192; i++ {
		s.Append(&model.Sample{
			Metric:    model.Metric{"__name__": model.LabelValue(strconv.Itoa(i)), "job": "test"},
			Timestamp: insertStart,
			Value:     1,
		})
	}
	s.WaitForIndexing()

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
		for pb.Next() {
			s.QueryRange(context.Background(), insertStart, now, lm)
		}
	})
}

func TestQueryRangeThroughBeforeFrom(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	for i := 0; i < 8192; i++ {
		s.Append(&model.Sample{
			Metric:    model.Metric{"__name__": "testmetric", "job": "test"},
			Timestamp: insertStart.Add(time.Duration(i) * time.Second),
			Value:     model.SampleValue(rand.Float64()),
		})
	}
	s.WaitForIndexing()

	lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
	iters, err := s.QueryRange(context.Background(), now.Add(-30*time.Minute), now.Add(-90*time.Minute), lm)
	if err != nil {
		t.Error(err)
	}
	if len(iters) != 0 {
		t.Errorf("expected no iters to be returned, got %d", len(iters))
	}
}

func TestRetentionCutoff(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	s.dropAfter = 1 * time.Hour

	for i := 0; i < 120; i++ {
		smpl := &model.Sample{
			Metric:    model.Metric{"job": "test"},
			Timestamp: insertStart.Add(time.Duration(i) * time.Minute), // 1 minute intervals.
			Value:     1,
		}
		s.Append(smpl)
	}
	s.WaitForIndexing()

	lm, err := metric.NewLabelMatcher(metric.Equal, "job", "test")
	if err != nil {
		t.Fatalf("error creating label matcher: %s", err)
	}
	its, err := s.QueryRange(context.Background(), insertStart, now, lm)
	if err != nil {
		t.Fatal(err)
	}

	if len(its) != 1 {
		t.Fatalf("expected one iterator but got %d", len(its))
	}

	val := its[0].ValueAtOrBeforeTime(now.Add(-61 * time.Minute))
	if val.Timestamp != model.Earliest {
		t.Errorf("unexpected result for timestamp before retention period")
	}

	vals := its[0].RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now})
	// We get 59 values here because the model.Now() is slightly later
	// than our now.
	if len(vals) != 59 {
		t.Errorf("expected 59 values but got %d", len(vals))
	}
	if expt := now.Add(-1 * time.Hour).Add(time.Minute); vals[0].Timestamp != expt {
		t.Errorf("unexpected timestamp for first sample: %v, expected %v", vals[0].Timestamp.Time(), expt.Time())
	}
}

func TestDropMetrics(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	chunkFileExists := func(fp model.Fingerprint) (bool, error) {
		f, err := s.persistence.openChunkFileForReading(fp)
		if err == nil {
			f.Close()
			return true, nil
		}
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}

	m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
	m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
	m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}

	lm1, err := metric.NewLabelMatcher(metric.Equal, "n1", "v1")
	if err != nil {
		t.Fatal(err)
	}
	lmAll, err := metric.NewLabelMatcher(metric.Equal, model.MetricNameLabel, "test")
	if err != nil {
		t.Fatal(err)
	}

	N := 120000

	for j, m := range []model.Metric{m1, m2, m3} {
		for i := 0; i < N; i++ {
			smpl := &model.Sample{
				Metric:    m,
				Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
				Value:     model.SampleValue(j),
			}
			s.Append(smpl)
		}
	}
	s.WaitForIndexing()

	// Archive m3, but first maintain it so that at least something is written to disk.
	fpToBeArchived := m3.FastFingerprint()
	s.maintainMemorySeries(fpToBeArchived, 0)
	s.fpLocker.Lock(fpToBeArchived)
	s.fpToSeries.del(fpToBeArchived)
	s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
	s.fpLocker.Unlock(fpToBeArchived)

	fps := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps) != 3 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps))
	}

	fpList := model.Fingerprints{m1.FastFingerprint(), m2.FastFingerprint(), fpToBeArchived}

	n, err := s.DropMetricsForLabelMatchers(context.Background(), lm1)
	if err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Fatalf("expected 1 series to be dropped, got %d", n)
	}
	s.WaitForIndexing()

	fps2 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps2) != 2 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps2))
	}

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}

	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[1]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != N {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
	exists, err := chunkFileExists(fpList[2])
	if err != nil {
		t.Fatal(err)
	}
	if !exists {
		t.Errorf("chunk file does not exist for fp=%v", fpList[2])
	}

	n, err = s.DropMetricsForLabelMatchers(context.Background(), lmAll)
	if err != nil {
		t.Fatal(err)
	}
	if n != 2 {
		t.Fatalf("expected 2 series to be dropped, got %d", n)
	}
	s.WaitForIndexing()

	fps3 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps3) != 0 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps3))
	}

	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}

	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[1]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
	exists, err = chunkFileExists(fpList[2])
	if err != nil {
		t.Fatal(err)
	}
	if exists {
		t.Errorf("chunk file still exists for fp=%v", fpList[2])
	}
}

func TestQuarantineMetric(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	chunkFileExists := func(fp model.Fingerprint) (bool, error) {
		f, err := s.persistence.openChunkFileForReading(fp)
		if err == nil {
			f.Close()
			return true, nil
		}
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}

	m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
	m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
	m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}

	N := 120000

	for j, m := range []model.Metric{m1, m2, m3} {
		for i := 0; i < N; i++ {
			smpl := &model.Sample{
				Metric:    m,
				Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
				Value:     model.SampleValue(j),
			}
			s.Append(smpl)
		}
	}
	s.WaitForIndexing()

	// Archive m3, but first maintain it so that at least something is written to disk.
	fpToBeArchived := m3.FastFingerprint()
	s.maintainMemorySeries(fpToBeArchived, 0)
	s.fpLocker.Lock(fpToBeArchived)
	s.fpToSeries.del(fpToBeArchived)
	s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
	s.fpLocker.Unlock(fpToBeArchived)

	// Corrupt the series file for m3.
	f, err := os.Create(s.persistence.fileNameForFingerprint(fpToBeArchived))
	if err != nil {
		t.Fatal(err)
	}
	if _, err := f.WriteString("This is clearly not the content of a series file."); err != nil {
		t.Fatal(err)
	}
	if f.Close(); err != nil {
		t.Fatal(err)
	}

	fps := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps) != 3 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps))
	}

	// This will access the corrupt file and lead to quarantining.
	iter := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fpToBeArchived), now.Add(-2*time.Hour-1*time.Minute), now.Add(-2*time.Hour))
	iter.Close()
	time.Sleep(time.Second) // Give time to quarantine. TODO(beorn7): Find a better way to wait.
	s.WaitForIndexing()

	fps2 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps2) != 2 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps2))
	}

	exists, err := chunkFileExists(fpToBeArchived)
	if err != nil {
		t.Fatal(err)
	}
	if exists {
		t.Errorf("chunk file exists for fp=%v", fpToBeArchived)
	}
}

// TestLoop is just a smoke test for the loop method, if we can switch it on and
// off without disaster.
func TestLoop(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}
	samples := make(model.Samples, 1000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	directory := testutil.NewTemporaryDirectory("test_storage", t)
	defer directory.Close()
	o := &MemorySeriesStorageOptions{
		TargetHeapSize:             100000,
		PersistenceRetentionPeriod: 24 * 7 * time.Hour,
		PersistenceStoragePath:     directory.Path(),
		HeadChunkTimeout:           5 * time.Minute,
		CheckpointInterval:         250 * time.Millisecond,
		SyncStrategy:               Adaptive,
		MinShrinkRatio:             0.1,
	}
	storage := NewMemorySeriesStorage(o)
	if err := storage.Start(); err != nil {
		t.Errorf("Error starting storage: %s", err)
	}
	for _, s := range samples {
		storage.Append(s)
	}
	storage.WaitForIndexing()
	fp := model.Metric{}.FastFingerprint()
	series, _ := storage.fpToSeries.get(fp)
	storage.fpLocker.Lock(fp)
	cdsBefore := len(series.chunkDescs)
	storage.fpLocker.Unlock(fp)
	time.Sleep(fpMaxWaitDuration + time.Second) // TODO(beorn7): Ugh, need to wait for maintenance to kick in.
	storage.fpLocker.Lock(fp)
	cdsAfter := len(series.chunkDescs)
	storage.fpLocker.Unlock(fp)
	storage.Stop()
	if cdsBefore <= cdsAfter {
		t.Errorf(
			"Number of chunk descriptors should have gone down by now. Got before %d, after %d.",
			cdsBefore, cdsAfter,
		)
	}
}

func testChunk(t *testing.T, encoding chunk.Encoding) {
	samples := make(model.Samples, 500000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	for m := range s.fpToSeries.iter() {
		s.fpLocker.Lock(m.fp)
		var values []model.SamplePair
		for _, cd := range m.series.chunkDescs {
			if cd.IsEvicted() {
				continue
			}
			it := cd.C.NewIterator()
			for it.Scan() {
				values = append(values, it.Value())
			}
			if it.Err() != nil {
				t.Error(it.Err())
			}
		}

		for i, v := range values {
			if samples[i].Timestamp != v.Timestamp {
				t.Errorf("%d. Got %v; want %v", i, v.Timestamp, samples[i].Timestamp)
			}
			if samples[i].Value != v.Value {
				t.Errorf("%d. Got %v; want %v", i, v.Value, samples[i].Value)
			}
		}
		s.fpLocker.Unlock(m.fp)
	}
	log.Info("test done, closing")
}

func TestChunkType0(t *testing.T) {
	testChunk(t, 0)
}

func TestChunkType1(t *testing.T) {
	testChunk(t, 1)
}

func TestChunkType2(t *testing.T) {
	testChunk(t, 2)
}

func testValueAtOrBeforeTime(t *testing.T, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)

	// #1 Exactly on a sample.
	for i, expected := range samples {
		actual := it.ValueAtOrBeforeTime(expected.Timestamp)

		if expected.Timestamp != actual.Timestamp {
			t.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}
	}

	// #2 Between samples.
	for i, expected := range samples {
		if i == len(samples)-1 {
			continue
		}
		actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)

		if expected.Timestamp != actual.Timestamp {
			t.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}
	}

	// #3 Corner cases: Just before the first sample, just after the last.
	expected := &model.Sample{Timestamp: model.Earliest}
	actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
	if expected.Timestamp != actual.Timestamp {
		t.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
	}
	if expected.Value != actual.Value {
		t.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
	}
	expected = samples[len(samples)-1]
	actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
	if expected.Timestamp != actual.Timestamp {
		t.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
	}
	if expected.Value != actual.Value {
		t.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
	}

	// #4 Query alternatingly exactly on and just between timestamps.
	// Exposes issue #2965.
	for i, expected := range samples {
		i *= 2
		actual := it.ValueAtOrBeforeTime(expected.Timestamp)
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}

		i++
		actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}
	}
}

func TestValueAtTimeChunkType0(t *testing.T) {
	testValueAtOrBeforeTime(t, 0)
}

func TestValueAtTimeChunkType1(t *testing.T) {
	testValueAtOrBeforeTime(t, 1)
}

func TestValueAtTimeChunkType2(t *testing.T) {
	testValueAtOrBeforeTime(t, 2)
}

func benchmarkValueAtOrBeforeTime(b *testing.B, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(b, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// #1 Exactly on a sample.
		for i, expected := range samples {
			actual := it.ValueAtOrBeforeTime(expected.Timestamp)

			if expected.Timestamp != actual.Timestamp {
				b.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
			}
			if expected.Value != actual.Value {
				b.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
			}
		}

		// #2 Between samples.
		for i, expected := range samples {
			if i == len(samples)-1 {
				continue
			}
			actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)

			if expected.Timestamp != actual.Timestamp {
				b.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
			}
			if expected.Value != actual.Value {
				b.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
			}
		}

		// #3 Corner cases: Just before the first sample, just after the last.
		expected := &model.Sample{Timestamp: model.Earliest}
		actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
		if expected.Timestamp != actual.Timestamp {
			b.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			b.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
		}
		expected = samples[len(samples)-1]
		actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
		if expected.Timestamp != actual.Timestamp {
			b.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			b.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
		}
	}
}

func BenchmarkValueAtOrBeforeTimeChunkType0(b *testing.B) {
	benchmarkValueAtOrBeforeTime(b, 0)
}

func BenchmarkValueAtTimeChunkType1(b *testing.B) {
	benchmarkValueAtOrBeforeTime(b, 1)
}

func BenchmarkValueAtTimeChunkType2(b *testing.B) {
	benchmarkValueAtOrBeforeTime(b, 2)
}

func testRangeValues(t *testing.T, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)

	// #1 Zero length interval at sample.
	for i, expected := range samples {
		actual := it.RangeValues(metric.Interval{
			OldestInclusive: expected.Timestamp,
			NewestInclusive: expected.Timestamp,
		})

		if len(actual) != 1 {
			t.Fatalf("1.%d. Expected exactly one result, got %d.", i, len(actual))
		}
		if expected.Timestamp != actual[0].Timestamp {
			t.Errorf("1.%d. Got %v; want %v.", i, actual[0].Timestamp, expected.Timestamp)
		}
		if expected.Value != actual[0].Value {
			t.Errorf("1.%d. Got %v; want %v.", i, actual[0].Value, expected.Value)
		}
	}

	// #2 Zero length interval off sample.
	for i, expected := range samples {
		actual := it.RangeValues(metric.Interval{
			OldestInclusive: expected.Timestamp + 1,
			NewestInclusive: expected.Timestamp + 1,
		})

		if len(actual) != 0 {
			t.Fatalf("2.%d. Expected no result, got %d.", i, len(actual))
		}
	}

	// #3 2sec interval around sample.
	for i, expected := range samples {
		actual := it.RangeValues(metric.Interval{
			OldestInclusive: expected.Timestamp - 1,
			NewestInclusive: expected.Timestamp + 1,
		})

		if len(actual) != 1 {
			t.Fatalf("3.%d. Expected exactly one result, got %d.", i, len(actual))
		}
		if expected.Timestamp != actual[0].Timestamp {
			t.Errorf("3.%d. Got %v; want %v.", i, actual[0].Timestamp, expected.Timestamp)
		}
		if expected.Value != actual[0].Value {
			t.Errorf("3.%d. Got %v; want %v.", i, actual[0].Value, expected.Value)
		}
	}

	// #4 2sec interval sample to sample.
	for i, expected1 := range samples {
		if i == len(samples)-1 {
			continue
		}
		expected2 := samples[i+1]
		actual := it.RangeValues(metric.Interval{
			OldestInclusive: expected1.Timestamp,
			NewestInclusive: expected1.Timestamp + 2,
		})

		if len(actual) != 2 {
			t.Fatalf("4.%d. Expected exactly 2 results, got %d.", i, len(actual))
		}
		if expected1.Timestamp != actual[0].Timestamp {
			t.Errorf("4.%d. Got %v for 1st result; want %v.", i, actual[0].Timestamp, expected1.Timestamp)
		}
		if expected1.Value != actual[0].Value {
			t.Errorf("4.%d. Got %v for 1st result; want %v.", i, actual[0].Value, expected1.Value)
		}
		if expected2.Timestamp != actual[1].Timestamp {
			t.Errorf("4.%d. Got %v for 2nd result; want %v.", i, actual[1].Timestamp, expected2.Timestamp)
		}
		if expected2.Value != actual[1].Value {
			t.Errorf("4.%d. Got %v for 2nd result; want %v.", i, actual[1].Value, expected2.Value)
		}
	}

	// #5 corner cases: Interval ends at first sample, interval starts
	// at last sample, interval entirely before/after samples.
	expected := samples[0]
	actual := it.RangeValues(metric.Interval{
		OldestInclusive: expected.Timestamp - 2,
		NewestInclusive: expected.Timestamp,
	})
	if len(actual) != 1 {
		t.Fatalf("5.1. Expected exactly one result, got %d.", len(actual))
	}
	if expected.Timestamp != actual[0].Timestamp {
		t.Errorf("5.1. Got %v; want %v.", actual[0].Timestamp, expected.Timestamp)
	}
	if expected.Value != actual[0].Value {
		t.Errorf("5.1. Got %v; want %v.", actual[0].Value, expected.Value)
	}
	expected = samples[len(samples)-1]
	actual = it.RangeValues(metric.Interval{
		OldestInclusive: expected.Timestamp,
		NewestInclusive: expected.Timestamp + 2,
	})
	if len(actual) != 1 {
		t.Fatalf("5.2. Expected exactly one result, got %d.", len(actual))
	}
	if expected.Timestamp != actual[0].Timestamp {
		t.Errorf("5.2. Got %v; want %v.", actual[0].Timestamp, expected.Timestamp)
	}
	if expected.Value != actual[0].Value {
		t.Errorf("5.2. Got %v; want %v.", actual[0].Value, expected.Value)
	}
	firstSample := samples[0]
	actual = it.RangeValues(metric.Interval{
		OldestInclusive: firstSample.Timestamp - 4,
		NewestInclusive: firstSample.Timestamp - 2,
	})
	if len(actual) != 0 {
		t.Fatalf("5.3. Expected no results, got %d.", len(actual))
	}
	lastSample := samples[len(samples)-1]
	actual = it.RangeValues(metric.Interval{
		OldestInclusive: lastSample.Timestamp + 2,
		NewestInclusive: lastSample.Timestamp + 4,
	})
	if len(actual) != 0 {
		t.Fatalf("5.3. Expected no results, got %d.", len(actual))
	}
}

func TestRangeValuesChunkType0(t *testing.T) {
	testRangeValues(t, 0)
}

func TestRangeValuesChunkType1(t *testing.T) {
	testRangeValues(t, 1)
}

func TestRangeValuesChunkType2(t *testing.T) {
	testRangeValues(t, 2)
}

func benchmarkRangeValues(b *testing.B, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}
	s, closer := NewTestStorage(b, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		for _, sample := range samples {
			actual := it.RangeValues(metric.Interval{
				OldestInclusive: sample.Timestamp - 20,
				NewestInclusive: sample.Timestamp + 20,
			})

			if len(actual) < 10 {
				b.Fatalf("not enough samples found")
			}
		}
	}
}

func BenchmarkRangeValuesChunkType0(b *testing.B) {
	benchmarkRangeValues(b, 0)
}

func BenchmarkRangeValuesChunkType1(b *testing.B) {
	benchmarkRangeValues(b, 1)
}

func BenchmarkRangeValuesChunkType2(b *testing.B) {
	benchmarkRangeValues(b, 2)
}

func testEvictAndPurgeSeries(t *testing.T, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i * i)),
		}
	}
	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	// Drop ~half of the chunks.
	s.maintainMemorySeries(fp, 10000)
	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
	actual := it.RangeValues(metric.Interval{
		OldestInclusive: 0,
		NewestInclusive: 100000,
	})
	if len(actual) < 4000 {
		t.Fatalf("expected more than %d results after purging half of series, got %d", 4000, len(actual))
	}
	if actual[0].Timestamp < 6000 || actual[0].Timestamp > 10000 {
		t.Errorf("1st timestamp out of expected range: %v", actual[0].Timestamp)
	}
	want := model.Time(19998)
	if actual[len(actual)-1].Timestamp != want {
		t.Errorf("2nd timestamp: want %v, got %v", want, actual[1].Timestamp)
	}

	// Drop everything.
	s.maintainMemorySeries(fp, 100000)
	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
	actual = it.RangeValues(metric.Interval{
		OldestInclusive: 0,
		NewestInclusive: 100000,
	})
	if len(actual) != 0 {
		t.Fatal("expected zero results after purging the whole series")
	}

	// Recreate series.
	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	series, ok := s.fpToSeries.get(fp)
	if !ok {
		t.Fatal("could not find series")
	}

	// Persist head chunk so we can safely archive.
	series.headChunkClosed = true
	s.maintainMemorySeries(fp, model.Earliest)

	// Archive metrics.
	s.fpToSeries.del(fp)
	lastTime, err := series.head().LastTime()
	if err != nil {
		t.Fatal(err)
	}
	s.persistence.archiveMetric(fp, series.metric, series.firstTime(), lastTime)
	archived, _, _ := s.persistence.hasArchivedMetric(fp)
	if !archived {
		t.Fatal("not archived")
	}

	// Drop ~half of the chunks of an archived series.
	s.maintainArchivedSeries(fp, 10000)
	archived, _, _ = s.persistence.hasArchivedMetric(fp)
	if !archived {
		t.Fatal("archived series purged although only half of the chunks dropped")
	}

	// Drop everything.
	s.maintainArchivedSeries(fp, 100000)
	archived, _, _ = s.persistence.hasArchivedMetric(fp)
	if archived {
		t.Fatal("archived series not dropped")
	}

	// Recreate series.
	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	series, ok = s.fpToSeries.get(fp)
	if !ok {
		t.Fatal("could not find series")
	}

	// Persist head chunk so we can safely archive.
	series.headChunkClosed = true
	s.maintainMemorySeries(fp, model.Earliest)

	// Archive metrics.
	s.fpToSeries.del(fp)
	lastTime, err = series.head().LastTime()
	if err != nil {
		t.Fatal(err)
	}
	s.persistence.archiveMetric(fp, series.metric, series.firstTime(), lastTime)
	archived, _, _ = s.persistence.hasArchivedMetric(fp)
	if !archived {
		t.Fatal("not archived")
	}

	// Unarchive metrics.
	s.getOrCreateSeries(fp, model.Metric{})

	series, ok = s.fpToSeries.get(fp)
	if !ok {
		t.Fatal("could not find series")
	}
	archived, _, _ = s.persistence.hasArchivedMetric(fp)
	if archived {
		t.Fatal("archived")
	}

	// Set archiveHighWatermark to a low value so that we can see it increase.
	s.archiveHighWatermark = 42

	// This will archive again, but must not drop it completely, despite the
	// memorySeries being empty.
	s.maintainMemorySeries(fp, 10000)
	archived, _, _ = s.persistence.hasArchivedMetric(fp)
	if !archived {
		t.Fatal("series purged completely")
	}
	// archiveHighWatermark must have been set by maintainMemorySeries.
	if want, got := model.Time(19998), s.archiveHighWatermark; want != got {
		t.Errorf("want archiveHighWatermark %v, got %v", want, got)
	}
}

func TestEvictAndPurgeSeriesChunkType0(t *testing.T) {
	testEvictAndPurgeSeries(t, 0)
}

func TestEvictAndPurgeSeriesChunkType1(t *testing.T) {
	testEvictAndPurgeSeries(t, 1)
}

func TestEvictAndPurgeSeriesChunkType2(t *testing.T) {
	testEvictAndPurgeSeries(t, 2)
}

func testEvictAndLoadChunkDescs(t *testing.T, encoding chunk.Encoding) {
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i * i)),
		}
	}
	// Give last sample a timestamp of now so that the head chunk will not
	// be closed (which would then archive the time series later as
	// everything will get evicted).
	samples[len(samples)-1] = &model.Sample{
		Timestamp: model.Now(),
		Value:     model.SampleValue(3.14),
	}

	// Sadly, chunk.NumMemChunks is a global variable. We have to reset it
	// explicitly here.
	atomic.StoreInt64(&chunk.NumMemChunks, 0)

	s, closer := NewTestStorage(t, encoding)
	defer closer.Close()

	// Adjust target heap size to lower value to see evictions.
	s.targetHeapSize = 1000000

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()

	series, ok := s.fpToSeries.get(fp)
	if !ok {
		t.Fatal("could not find series")
	}

	oldLen := len(series.chunkDescs)
	// Maintain series without any dropped chunks.
	s.maintainMemorySeries(fp, 0)
	// Give the evict goroutine an opportunity to run.
	time.Sleep(1250 * time.Millisecond)
	// Maintain series again to trigger chunk.Desc eviction.
	s.maintainMemorySeries(fp, 0)

	if oldLen <= len(series.chunkDescs) {
		t.Errorf("Expected number of chunkDescs to decrease, old number %d, current number %d.", oldLen, len(series.chunkDescs))
	}
	if int64(len(series.chunkDescs)) < atomic.LoadInt64(&chunk.NumMemChunks) {
		t.Errorf("NumMemChunks is larger than number of chunk descs, number of chunk descs: %d, NumMemChunks: %d.", len(series.chunkDescs), atomic.LoadInt64(&chunk.NumMemChunks))
	}

	// Load everything back.
	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), 0, 100000)

	if oldLen != len(series.chunkDescs) {
		t.Errorf("Expected number of chunkDescs to have reached old value again, old number %d, current number %d.", oldLen, len(series.chunkDescs))
	}

	it.Close()

	// Now maintain series with drops to make sure nothing crazy happens.
	s.maintainMemorySeries(fp, 100000)

	if len(series.chunkDescs) != 1 {
		t.Errorf("Expected exactly one chunk.Desc left, got %d.", len(series.chunkDescs))
	}
}

func TestEvictAndLoadChunkDescsType0(t *testing.T) {
	testEvictAndLoadChunkDescs(t, 0)
}

func TestEvictAndLoadChunkDescsType1(t *testing.T) {
	testEvictAndLoadChunkDescs(t, 1)
}

func benchmarkAppend(b *testing.B, encoding chunk.Encoding) {
	samples := make(model.Samples, b.N)
	for i := range samples {
		samples[i] = &model.Sample{
			Metric: model.Metric{
				model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
				"label1":              model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
				"label2":              model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
			},
			Timestamp: model.Time(i),
			Value:     model.SampleValue(i),
		}
	}
	b.ResetTimer()
	s, closer := NewTestStorage(b, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
}

func BenchmarkAppendType0(b *testing.B) {
	benchmarkAppend(b, 0)
}

func BenchmarkAppendType1(b *testing.B) {
	benchmarkAppend(b, 1)
}

func BenchmarkAppendType2(b *testing.B) {
	benchmarkAppend(b, 2)
}

// Append a large number of random samples and then check if we can get them out
// of the storage alright.
func testFuzz(t *testing.T, encoding chunk.Encoding) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}

	check := func(seed int64) bool {
		rand.Seed(seed)
		s, c := NewTestStorage(t, encoding)
		defer c.Close()

		samples := createRandomSamples("test_fuzz", 10000)
		for _, sample := range samples {
			s.Append(sample)
		}
		if !verifyStorageRandom(t, s, samples) {
			return false
		}
		return verifyStorageSequential(t, s, samples)
	}

	if err := quick.Check(check, nil); err != nil {
		t.Fatal(err)
	}
}

func TestFuzzChunkType0(t *testing.T) {
	testFuzz(t, 0)
}

func TestFuzzChunkType1(t *testing.T) {
	testFuzz(t, 1)
}

func TestFuzzChunkType2(t *testing.T) {
	testFuzz(t, 2)
}

// benchmarkFuzz is the benchmark version of testFuzz. The storage options are
// set such that evictions, checkpoints, and purging will happen concurrently,
// too. This benchmark will have a very long runtime (up to minutes). You can
// use it as an actual benchmark. Run it like this:
//
// go test -cpu 1,2,4,8 -run=NONE -bench BenchmarkFuzzChunkType -benchmem
//
// You can also use it as a test for races. In that case, run it like this (will
// make things even slower):
//
// go test -race -cpu 8 -short -bench BenchmarkFuzzChunkType
func benchmarkFuzz(b *testing.B, encoding chunk.Encoding) {
	chunk.DefaultEncoding = encoding
	const samplesPerRun = 100000
	rand.Seed(42)
	directory := testutil.NewTemporaryDirectory("test_storage", b)
	defer directory.Close()
	o := &MemorySeriesStorageOptions{
		TargetHeapSize:             200000,
		PersistenceRetentionPeriod: time.Hour,
		PersistenceStoragePath:     directory.Path(),
		HeadChunkTimeout:           5 * time.Minute,
		CheckpointInterval:         time.Second,
		SyncStrategy:               Adaptive,
		MinShrinkRatio:             0.1,
	}
	s := NewMemorySeriesStorage(o)
	if err := s.Start(); err != nil {
		b.Fatalf("Error starting storage: %s", err)
	}
	s.Start()
	defer s.Stop()

	samples := createRandomSamples("benchmark_fuzz", samplesPerRun*b.N)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		start := samplesPerRun * i
		end := samplesPerRun * (i + 1)
		middle := (start + end) / 2
		for _, sample := range samples[start:middle] {
			s.Append(sample)
		}
		verifyStorageRandom(b, s, samples[:middle])
		for _, sample := range samples[middle:end] {
			s.Append(sample)
		}
		verifyStorageRandom(b, s, samples[:end])
		verifyStorageSequential(b, s, samples)
	}
}

func BenchmarkFuzzChunkType0(b *testing.B) {
	benchmarkFuzz(b, 0)
}

func BenchmarkFuzzChunkType1(b *testing.B) {
	benchmarkFuzz(b, 1)
}

func BenchmarkFuzzChunkType2(b *testing.B) {
	benchmarkFuzz(b, 2)
}

func createRandomSamples(metricName string, minLen int) model.Samples {
	type valueCreator func() model.SampleValue
	type deltaApplier func(model.SampleValue) model.SampleValue

	var (
		maxMetrics      = 5
		maxStreakLength = 2000
		maxTimeDelta    = 10000
		timestamp       = model.Now() - model.Time(maxTimeDelta*minLen) // So that some timestamps are in the future.
		generators      = []struct {
			createValue valueCreator
			applyDelta  []deltaApplier
		}{
			{ // "Boolean".
				createValue: func() model.SampleValue {
					return model.SampleValue(rand.Intn(2))
				},
				applyDelta: []deltaApplier{
					func(_ model.SampleValue) model.SampleValue {
						return model.SampleValue(rand.Intn(2))
					},
				},
			},
			{ // Integer with int deltas of various byte length.
				createValue: func() model.SampleValue {
					return model.SampleValue(rand.Int63() - 1<<62)
				},
				applyDelta: []deltaApplier{
					func(v model.SampleValue) model.SampleValue {
						return model.SampleValue(rand.Intn(1<<8) - 1<<7 + int(v))
					},
					func(v model.SampleValue) model.SampleValue {
						return model.SampleValue(rand.Intn(1<<16) - 1<<15 + int(v))
					},
					func(v model.SampleValue) model.SampleValue {
						return model.SampleValue(rand.Int63n(1<<32) - 1<<31 + int64(v))
					},
				},
			},
			{ // Float with float32 and float64 deltas.
				createValue: func() model.SampleValue {
					return model.SampleValue(rand.NormFloat64())
				},
				applyDelta: []deltaApplier{
					func(v model.SampleValue) model.SampleValue {
						return v + model.SampleValue(float32(rand.NormFloat64()))
					},
					func(v model.SampleValue) model.SampleValue {
						return v + model.SampleValue(rand.NormFloat64())
					},
				},
			},
		}
		timestampIncrementers = []func(baseDelta model.Time) model.Time{
			// Regular increments.
			func(delta model.Time) model.Time {
				return delta
			},
			// Jittered increments. σ is 1/100 of delta, e.g. 10ms for 10s scrape interval.
			func(delta model.Time) model.Time {
				return delta + model.Time(rand.NormFloat64()*float64(delta)/100)
			},
			// Regular increments, but missing a scrape with 10% chance.
			func(delta model.Time) model.Time {
				i := rand.Intn(100)
				if i < 90 {
					return delta
				}
				if i < 99 {
					return 2 * delta
				}
				return 3 * delta
				// Ignoring the case with more than two missed scrapes in a row.
			},
		}
	)

	// Prefill result with two samples with colliding metrics (to test fingerprint mapping).
	result := model.Samples{
		&model.Sample{
			Metric: model.Metric{
				"instance": "ip-10-33-84-73.l05.ams5.s-cloud.net:24483",
				"status":   "503",
			},
			Value:     42,
			Timestamp: timestamp,
		},
		&model.Sample{
			Metric: model.Metric{
				"instance": "ip-10-33-84-73.l05.ams5.s-cloud.net:24480",
				"status":   "500",
			},
			Value:     2010,
			Timestamp: timestamp + 1,
		},
	}

	metrics := []model.Metric{}
	for n := rand.Intn(maxMetrics); n >= 0; n-- {
		metrics = append(metrics, model.Metric{
			model.MetricNameLabel:                             model.LabelValue(metricName),
			model.LabelName(fmt.Sprintf("labelname_%d", n+1)): model.LabelValue(fmt.Sprintf("labelvalue_%d", rand.Int())),
		})
	}

	for len(result) < minLen {
		var (
			// Pick a metric for this cycle.
			metric       = metrics[rand.Intn(len(metrics))]
			timeDelta    = model.Time(rand.Intn(maxTimeDelta) + 1)
			generator    = generators[rand.Intn(len(generators))]
			createValue  = generator.createValue
			applyDelta   = generator.applyDelta[rand.Intn(len(generator.applyDelta))]
			incTimestamp = timestampIncrementers[rand.Intn(len(timestampIncrementers))]
		)

		switch rand.Intn(4) {
		case 0: // A single sample.
			result = append(result, &model.Sample{
				Metric:    metric,
				Value:     createValue(),
				Timestamp: timestamp,
			})
			timestamp += incTimestamp(timeDelta)
		case 1: // A streak of random sample values.
			for n := rand.Intn(maxStreakLength); n >= 0; n-- {
				result = append(result, &model.Sample{
					Metric:    metric,
					Value:     createValue(),
					Timestamp: timestamp,
				})
				timestamp += incTimestamp(timeDelta)
			}
		case 2: // A streak of sample values with incremental changes.
			value := createValue()
			for n := rand.Intn(maxStreakLength); n >= 0; n-- {
				result = append(result, &model.Sample{
					Metric:    metric,
					Value:     value,
					Timestamp: timestamp,
				})
				timestamp += incTimestamp(timeDelta)
				value = applyDelta(value)
			}
		case 3: // A streak of constant sample values.
			value := createValue()
			for n := rand.Intn(maxStreakLength); n >= 0; n-- {
				result = append(result, &model.Sample{
					Metric:    metric,
					Value:     value,
					Timestamp: timestamp,
				})
				timestamp += incTimestamp(timeDelta)
			}
		}
	}

	return result
}

func verifyStorageRandom(t testing.TB, s *MemorySeriesStorage, samples model.Samples) bool {
	s.WaitForIndexing()
	result := true
	for _, i := range rand.Perm(len(samples)) {
		sample := samples[i]
		fp := s.mapper.mapFP(sample.Metric.FastFingerprint(), sample.Metric)
		it := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fp), sample.Timestamp, sample.Timestamp)
		found := it.ValueAtOrBeforeTime(sample.Timestamp)
		startTime := it.(*boundedIterator).start
		switch {
		case found.Timestamp != model.Earliest && sample.Timestamp.Before(startTime):
			t.Errorf("Sample #%d %#v: Expected outdated sample to be excluded.", i, sample)
			result = false
		case found.Timestamp == model.Earliest && !sample.Timestamp.Before(startTime):
			t.Errorf("Sample #%d %#v: Expected sample not found.", i, sample)
			result = false
		case found.Timestamp == model.Earliest && sample.Timestamp.Before(startTime):
			// All good. Outdated sample dropped.
		case sample.Value != found.Value || sample.Timestamp != found.Timestamp:
			t.Errorf(
				"Sample #%d %#v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v).",
				i, sample, sample.Value, sample.Timestamp, found.Value, found.Timestamp,
			)
			result = false
		}
		it.Close()
	}
	return result
}

func verifyStorageSequential(t testing.TB, s *MemorySeriesStorage, samples model.Samples) bool {
	s.WaitForIndexing()
	var (
		result = true
		fp     model.Fingerprint
		it     SeriesIterator
		r      []model.SamplePair
		j      int
	)
	defer func() {
		it.Close()
	}()
	for i, sample := range samples {
		newFP := s.mapper.mapFP(sample.Metric.FastFingerprint(), sample.Metric)
		if it == nil || newFP != fp {
			fp = newFP
			if it != nil {
				it.Close()
			}
			it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), sample.Timestamp, model.Latest)
			r = it.RangeValues(metric.Interval{
				OldestInclusive: sample.Timestamp,
				NewestInclusive: model.Latest,
			})
			j = -1
		}
		startTime := it.(*boundedIterator).start
		if sample.Timestamp.Before(startTime) {
			continue
		}
		j++
		if j >= len(r) {
			t.Errorf(
				"Sample #%d %v not found.",
				i, sample,
			)
			result = false
			continue
		}
		found := r[j]
		if sample.Value != found.Value || sample.Timestamp != found.Timestamp {
			t.Errorf(
				"Sample #%d %v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v).",
				i, sample, sample.Value, sample.Timestamp, found.Value, found.Timestamp,
			)
			result = false
		}
	}
	return result
}

func TestAppendOutOfOrder(t *testing.T) {
	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	m := model.Metric{
		model.MetricNameLabel: "out_of_order",
	}

	tests := []struct {
		name      string
		timestamp model.Time
		value     model.SampleValue
		wantErr   error
	}{
		{
			name:      "1st sample",
			timestamp: 0,
			value:     0,
			wantErr:   nil,
		},
		{
			name:      "regular append",
			timestamp: 2,
			value:     1,
			wantErr:   nil,
		},
		{
			name:      "same timestamp, same value (no-op)",
			timestamp: 2,
			value:     1,
			wantErr:   nil,
		},
		{
			name:      "same timestamp, different value",
			timestamp: 2,
			value:     2,
			wantErr:   ErrDuplicateSampleForTimestamp,
		},
		{
			name:      "earlier timestamp, same value",
			timestamp: 1,
			value:     2,
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "earlier timestamp, different value",
			timestamp: 1,
			value:     3,
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "regular append of NaN",
			timestamp: 3,
			value:     model.SampleValue(math.NaN()),
			wantErr:   nil,
		},
		{
			name:      "no-op append of NaN",
			timestamp: 3,
			value:     model.SampleValue(math.NaN()),
			wantErr:   nil,
		},
		{
			name:      "append of NaN with earlier timestamp",
			timestamp: 2,
			value:     model.SampleValue(math.NaN()),
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "append of normal sample after NaN with same timestamp",
			timestamp: 3,
			value:     3.14,
			wantErr:   ErrDuplicateSampleForTimestamp,
		},
	}

	for _, test := range tests {
		gotErr := s.Append(&model.Sample{
			Metric:    m,
			Timestamp: test.timestamp,
			Value:     test.value,
		})
		if gotErr != test.wantErr {
			t.Errorf("%s: got %q, want %q", test.name, gotErr, test.wantErr)
		}
	}

	fp := s.mapper.mapFP(m.FastFingerprint(), m)

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), 0, 2)
	defer it.Close()

	want := []model.SamplePair{
		{
			Timestamp: 0,
			Value:     0,
		},
		{
			Timestamp: 2,
			Value:     1,
		},
		{
			Timestamp: 3,
			Value:     model.SampleValue(math.NaN()),
		},
	}
	got := it.RangeValues(metric.Interval{OldestInclusive: 0, NewestInclusive: 3})
	// Note that we cannot just reflect.DeepEqual(want, got) because it has
	// the semantics of NaN != NaN.
	for i, gotSamplePair := range got {
		wantSamplePair := want[i]
		if !wantSamplePair.Equal(&gotSamplePair) {
			t.Fatalf("want %v, got %v", wantSamplePair, gotSamplePair)
		}
	}
}

func TestCalculatePersistUrgency(t *testing.T) {
	tests := map[string]struct {
		persistUrgency                        int32
		lenEvictList                          int
		numChunksToPersist                    int64
		targetHeapSize, msNextGC, msHeapAlloc uint64
		msNumGC, lastNumGC                    uint32

		wantPersistUrgency int32
		wantChunksToEvict  int
		wantLastNumGC      uint32
	}{
		"all zeros": {
			persistUrgency:     0,
			lenEvictList:       0,
			numChunksToPersist: 0,
			targetHeapSize:     0,
			msNextGC:           0,
			msHeapAlloc:        0,
			msNumGC:            0,
			lastNumGC:          0,

			wantPersistUrgency: 0,
			wantChunksToEvict:  0,
			wantLastNumGC:      0,
		},
		"far from target heap size, plenty of chunks to persist, GC has happened": {
			persistUrgency:     500,
			lenEvictList:       1000,
			numChunksToPersist: 100,
			targetHeapSize:     1000000,
			msNextGC:           500000,
			msHeapAlloc:        400000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 45,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"far from target heap size, plenty of chunks to persist, GC hasn't happened, urgency must not decrease": {
			persistUrgency:     500,
			lenEvictList:       1000,
			numChunksToPersist: 100,
			targetHeapSize:     1000000,
			msNextGC:           500000,
			msHeapAlloc:        400000,
			msNumGC:            42,
			lastNumGC:          42,

			wantPersistUrgency: 500,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"far from target heap size but no chunks to persist": {
			persistUrgency:     50,
			lenEvictList:       0,
			numChunksToPersist: 100,
			targetHeapSize:     1000000,
			msNextGC:           500000,
			msHeapAlloc:        400000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 500,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"far from target heap size but no chunks to persist, HeapAlloc > NextGC": {
			persistUrgency:     50,
			lenEvictList:       0,
			numChunksToPersist: 100,
			targetHeapSize:     1000000,
			msNextGC:           500000,
			msHeapAlloc:        600000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 600,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded but GC hasn't happened": {
			persistUrgency:     50,
			lenEvictList:       3000,
			numChunksToPersist: 1000,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          42,

			wantPersistUrgency: 275,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded, GC has happened": {
			persistUrgency:     50,
			lenEvictList:       3000,
			numChunksToPersist: 1000,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 275,
			wantChunksToEvict:  97,
			wantLastNumGC:      42,
		},
		"target heap size exceeded, GC has happened, urgency bumped due to low number of evictable chunks": {
			persistUrgency:     50,
			lenEvictList:       300,
			numChunksToPersist: 100,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 323,
			wantChunksToEvict:  97,
			wantLastNumGC:      42,
		},
		"target heap size exceeded but no evictable chunks and GC hasn't happened": {
			persistUrgency:     50,
			lenEvictList:       0,
			numChunksToPersist: 1000,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          42,

			wantPersistUrgency: 1000,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded but no evictable chunks and GC has happened": {
			persistUrgency:     50,
			lenEvictList:       0,
			numChunksToPersist: 1000,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 1000,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded, very few evictable chunks, GC hasn't happened": {
			persistUrgency:     50,
			lenEvictList:       10,
			numChunksToPersist: 1000,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          42,

			wantPersistUrgency: 1000,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded, some evictable chunks (but not enough), GC hasn't happened": {
			persistUrgency:     50,
			lenEvictList:       50,
			numChunksToPersist: 250,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          42,

			wantPersistUrgency: 916,
			wantChunksToEvict:  0,
			wantLastNumGC:      42,
		},
		"target heap size exceeded, some evictable chunks (but not enough), GC has happened": {
			persistUrgency:     50,
			lenEvictList:       50,
			numChunksToPersist: 250,
			targetHeapSize:     1000000,
			msNextGC:           1100000,
			msHeapAlloc:        900000,
			msNumGC:            42,
			lastNumGC:          41,

			wantPersistUrgency: 1000,
			wantChunksToEvict:  50,
			wantLastNumGC:      42,
		},
	}

	s, closer := NewTestStorage(t, 1)
	defer closer.Close()

	for scenario, test := range tests {
		s.persistUrgency = test.persistUrgency
		s.numChunksToPersist = test.numChunksToPersist
		s.targetHeapSize = test.targetHeapSize
		s.lastNumGC = test.lastNumGC
		s.evictList.Init()
		for i := 0; i < test.lenEvictList; i++ {
			s.evictList.PushBack(&struct{}{})
		}
		ms := runtime.MemStats{
			NextGC:    test.msNextGC,
			HeapAlloc: test.msHeapAlloc,
			NumGC:     test.msNumGC,
		}
		chunksToEvict := s.calculatePersistUrgency(&ms)

		if chunksToEvict != test.wantChunksToEvict {
			t.Errorf(
				"scenario %q: got %d chunks to evict, want %d",
				scenario, chunksToEvict, test.wantChunksToEvict,
			)
		}
		if s.persistUrgency != test.wantPersistUrgency {
			t.Errorf(
				"scenario %q: got persist urgency %d, want %d",
				scenario, s.persistUrgency, test.wantPersistUrgency,
			)
		}
		if s.lastNumGC != test.wantLastNumGC {
			t.Errorf(
				"scenario %q: got lastNumGC %d , want %d",
				scenario, s.lastNumGC, test.wantLastNumGC,
			)
		}
	}
}
-												Clean up license issues.

- Move CONTRIBUTORS.md to the more common AUTHORS.
- Added the required NOTICE file.
- Changed "Prometheus Team" to "The Prometheus Authors".
- Reverted the erroneous changes to the Apache License.

											
										
										
											2015-01-21 11:07:45 -08:00
+								// Copyright 2014 The Prometheus Authors
-												More code cleanups.

Add license text everywhere.
And others....

Change-Id: I11ccde267a2ef7eb366c4788ba7aeae14ba7545c

											
										
										
											2014-09-19 09:18:44 -07:00
+								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								//
 								// http://www.apache.org/licenses/LICENSE-2.0
 								//
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
-												Major code cleanup.

- Make it go-vet and golint clean.
- Add comments, TODOs, etc.

Change-Id: If1392d96f3d5b4cdde597b10c8dff1769fcfabe2

											
										
										
											2014-09-16 06:47:24 -07:00
+								package local
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
 								import (
 									"fmt"
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+									"hash/fnv"
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+									"math"
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									"math/rand"
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									"os"
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+									"runtime"
-												Added BenchmarkQueryRange

											
										
										
											2016-09-18 14:58:39 -07:00
+									"strconv"
-												Re-add counting of evict chunk ops and decrementing NumMemChunks

Also, modify test to expose the regression.

											
										
										
											2016-10-10 07:30:10 -07:00
+									"sync/atomic"
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									"testing"
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									"testing/quick"
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									"time"
-												Switch to common/log

											
										
										
											2015-10-03 01:21:43 -07:00
+									"github.com/prometheus/common/log"
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									"github.com/prometheus/common/model"
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+									"golang.org/x/net/context"
-												Improve persisting chunks to disk.

This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.

Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.

To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).

											
										
										
											2015-02-13 11:08:52 -08:00
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+									"github.com/prometheus/prometheus/storage/local/chunk"
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									"github.com/prometheus/prometheus/storage/metric"
-												Move pkg/ to util/

											
										
										
											2015-05-29 04:30:30 -07:00
+									"github.com/prometheus/prometheus/util/testutil"
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+								)
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+								func TestMatches(t *testing.T) {
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									storage, closer := NewTestStorage(t, 2)
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									defer closer.Close()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+									storage.archiveHighWatermark = 90
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make([]*model.Sample, 100)
 									fingerprints := make(model.Fingerprints, 100)
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
 									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										metric := model.Metric{
 											model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
 											"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
 											"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
 											"all":                 "const",
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+											Metric:    metric,
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											Timestamp: model.Time(i),
 											Value:     model.SampleValue(i),
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										}
-												Use FastFingerprint where appropriate.

											
										
										
											2015-05-05 09:17:51 -07:00
+										fingerprints[i] = metric.FastFingerprint()
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									}
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, s := range samples {
 										storage.Append(s)
 									}
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									storage.WaitForIndexing()
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+									// Archive every tenth metric.
 									for i, fp := range fingerprints {
 										if i%10 != 0 {
 											continue
 										}
 										s, ok := storage.fpToSeries.get(fp)
 										if !ok {
 											t.Fatal("could not retrieve series for fp", fp)
 										}
 										storage.fpLocker.Lock(fp)
-												Fix accidental publishing of memorySeries.firstTime()

											
										
										
											2016-09-26 04:06:06 -07:00
+										storage.persistence.archiveMetric(fp, s.metric, s.firstTime(), s.lastTime)
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+										storage.fpLocker.Unlock(fp)
 									}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										lm, err := metric.NewLabelMatcher(matchType, name, value)
 										if err != nil {
 											t.Fatalf("error creating label matcher: %s", err)
 										}
 										return lm
 									}
 									var matcherTests = []struct {
 										matchers metric.LabelMatchers
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										expected model.Fingerprints
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									}{
 										{
 											matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "x")},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											expected: model.Fingerprints{},
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										},
 										{
 											matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "test_0")},
 											expected: fingerprints[:10],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "label1", "test_0"),
 												newMatcher(metric.Equal, "label2", "test_1"),
 											},
 											expected: fingerprints[5:10],
 										},
 										{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "all", "const"),
 												newMatcher(metric.NotEqual, "label1", "x"),
 											},
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+											expected: fingerprints,
 										},
 										{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "all", "const"),
 												newMatcher(metric.NotEqual, "label1", "test_0"),
 											},
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+											expected: fingerprints[10:],
 										},
 										{
 											matchers: metric.LabelMatchers{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+												newMatcher(metric.Equal, "all", "const"),
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+												newMatcher(metric.NotEqual, "label1", "test_0"),
 												newMatcher(metric.NotEqual, "label1", "test_1"),
 												newMatcher(metric.NotEqual, "label1", "test_2"),
 											},
 											expected: fingerprints[30:],
 										},
 										{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "label1", ""),
 											},
 											expected: fingerprints[:0],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.NotEqual, "label1", "test_0"),
 												newMatcher(metric.Equal, "label1", ""),
 											},
 											expected: fingerprints[:0],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.NotEqual, "label1", "test_0"),
 												newMatcher(metric.Equal, "label2", ""),
 											},
 											expected: fingerprints[:0],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "all", "const"),
 												newMatcher(metric.NotEqual, "label1", "test_0"),
-												Fix common english misspellings

											
										
										
											2016-09-14 20:23:28 -07:00
+												newMatcher(metric.Equal, "not_existent", ""),
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											},
 											expected: fingerprints[10:],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
 											},
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+											expected: fingerprints[30:60],
 										},
 										{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "all", "const"),
 												newMatcher(metric.RegexNoMatch, "label1", `test_[3-5]`),
 											},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											expected: append(append(model.Fingerprints{}, fingerprints[:30]...), fingerprints[60:]...),
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
 												newMatcher(metric.RegexMatch, "label2", `test_[4-6]`),
 											},
 											expected: fingerprints[35:60],
 										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
 												newMatcher(metric.NotEqual, "label2", `test_4`),
 											},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											expected: append(append(model.Fingerprints{}, fingerprints[30:35]...), fingerprints[45:60]...),
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										},
-												Only do regex lookups when there was no equality match.

For the label matching index-based preselection phase, don't do an OR
between equality and non-equality matchers. Execute only one of the two
(with equality matchers preferred when present).

Fixes https://github.com/prometheus/prometheus/issues/924

											
										
										
											2015-07-23 13:46:13 -07:00
+										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "label1", `nonexistent`),
 												newMatcher(metric.RegexMatch, "label2", `test`),
 											},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											expected: model.Fingerprints{},
-												Only do regex lookups when there was no equality match.

For the label matching index-based preselection phase, don't do an OR
between equality and non-equality matchers. Execute only one of the two
(with equality matchers preferred when present).

Fixes https://github.com/prometheus/prometheus/issues/924

											
										
										
											2015-07-23 13:46:13 -07:00
+										},
 										{
 											matchers: metric.LabelMatchers{
 												newMatcher(metric.Equal, "label1", `test_0`),
 												newMatcher(metric.RegexMatch, "label2", `nonexistent`),
 											},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											expected: model.Fingerprints{},
-												Only do regex lookups when there was no equality match.

For the label matching index-based preselection phase, don't do an OR
between equality and non-equality matchers. Execute only one of the two
(with equality matchers preferred when present).

Fixes https://github.com/prometheus/prometheus/issues/924

											
										
										
											2015-07-23 13:46:13 -07:00
+										},
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									}
 									for _, mt := range matcherTests {
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										metrics, err := storage.MetricsForLabelMatchers(
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+											context.Background(),
-												Improve MetricsForLabelMatchers

WIP: This needs more tests.

It now gets a from and through value, which it may opportunistically
use to optimize the retrieval. With possible future range indices,
this could be used in a very efficient way. This change merely applies
some easy checks, which should nevertheless solve the use case of
heavy rule evaluations on servers with a lot of series churn.

Idea is the following:

- Only archive series that are at least as old as the headChunkTimeout
  (which was already extremely unlikely to happen).

- Then maintain a high watermark for the last archival, i.e. no
  archived series has a sample more recent than that watermark.

- Any query that doesn't reach to a time before that watermark doesn't
  have to touch the archive index at all. (A production server at
  Soundcloud with the aforementioned series churn and heavy rule
  evaluations spends 50% of its CPU time in archive index
  lookups. Since rule evaluations usually only touch very recent
  values, most of those lookup should disappear with this change.)

- Federation with a very broad label matcher will profit from this,
  too.

As a byproduct, the un-needed MetricForFingerprint method was removed
from the Storage interface.

											
										
										
											2016-03-08 15:09:42 -08:00
+											model.Earliest, model.Latest,
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											mt.matchers,
-												Improve MetricsForLabelMatchers

WIP: This needs more tests.

It now gets a from and through value, which it may opportunistically
use to optimize the retrieval. With possible future range indices,
this could be used in a very efficient way. This change merely applies
some easy checks, which should nevertheless solve the use case of
heavy rule evaluations on servers with a lot of series churn.

Idea is the following:

- Only archive series that are at least as old as the headChunkTimeout
  (which was already extremely unlikely to happen).

- Then maintain a high watermark for the last archival, i.e. no
  archived series has a sample more recent than that watermark.

- Any query that doesn't reach to a time before that watermark doesn't
  have to touch the archive index at all. (A production server at
  Soundcloud with the aforementioned series churn and heavy rule
  evaluations spends 50% of its CPU time in archive index
  lookups. Since rule evaluations usually only touch very recent
  values, most of those lookup should disappear with this change.)

- Federation with a very broad label matcher will profit from this,
  too.

As a byproduct, the un-needed MetricForFingerprint method was removed
from the Storage interface.

											
										
										
											2016-03-08 15:09:42 -08:00
+										)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										if err != nil {
 											t.Fatal(err)
 										}
 										if len(mt.expected) != len(metrics) {
 											t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.matchers, len(metrics))
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+										}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										for _, m := range metrics {
 											fp1 := m.Metric.FastFingerprint()
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+											found := false
 											for _, fp2 := range mt.expected {
 												if fp1 == fp2 {
 													found = true
 													break
 												}
 											}
 											if !found {
 												t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.matchers)
 											}
 										}
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+										// Smoketest for from/through.
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										metrics, err = storage.MetricsForLabelMatchers(
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+											context.Background(),
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+											model.Earliest, -10000,
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											mt.matchers,
 										)
 										if err != nil {
 											t.Fatal(err)
 										}
 										if len(metrics) > 0 {
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+											t.Error("expected no matches with 'through' older than any sample")
 										}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										metrics, err = storage.MetricsForLabelMatchers(
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+											context.Background(),
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+, model.Latest,
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											mt.matchers,
 										)
 										if err != nil {
 											t.Fatal(err)
 										}
 										if len(metrics) > 0 {
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+											t.Error("expected no matches with 'from' newer than any sample")
 										}
 										// Now the tricky one, cut out something from the middle.
 										var (
 											from    model.Time = 25
 											through model.Time = 75
 										)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										metrics, err = storage.MetricsForLabelMatchers(
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+											context.Background(),
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+											from, through,
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											mt.matchers,
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+										)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										if err != nil {
 											t.Fatal(err)
 										}
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+										expected := model.Fingerprints{}
 										for _, fp := range mt.expected {
 											i := 0
 											for ; fingerprints[i] != fp && i < len(fingerprints); i++ {
 											}
 											if i == len(fingerprints) {
 												t.Fatal("expected fingerprint does not exist")
 											}
 											if !model.Time(i).Before(from) && !model.Time(i).After(through) {
 												expected = append(expected, fp)
 											}
 										}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										if len(expected) != len(metrics) {
 											t.Errorf("expected %d range-limited matches for %q, found %d", len(expected), mt.matchers, len(metrics))
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+										}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										for _, m := range metrics {
 											fp1 := m.Metric.FastFingerprint()
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+											found := false
 											for _, fp2 := range expected {
 												if fp1 == fp2 {
 													found = true
 													break
 												}
 											}
 											if !found {
 												t.Errorf("expected fingerprint %s for %q not in range-limited result", fp1, mt.matchers)
 											}
 										}
-												Add tests for retrieving fingerprints for label matchers.

This checks for the basic behaviour of GetFingerprintsForLabelMatchers, that is, whether the different matcher types filter the correct fingerprints and intersections are correct.

											
										
										
											2015-02-27 05:41:43 -08:00
+									}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								}
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+								func TestFingerprintsForLabels(t *testing.T) {
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									storage, closer := NewTestStorage(t, 2)
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+									defer closer.Close()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make([]*model.Sample, 100)
 									fingerprints := make(model.Fingerprints, 100)
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
 									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										metric := model.Metric{
 											model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
 											"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
 											"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+										}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											Metric:    metric,
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											Timestamp: model.Time(i),
 											Value:     model.SampleValue(i),
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+										}
 										fingerprints[i] = metric.FastFingerprint()
 									}
 									for _, s := range samples {
 										storage.Append(s)
 									}
 									storage.WaitForIndexing()
 									var matcherTests = []struct {
-												Replace metric.LabelPair with model.LabelPair

											
										
										
											2015-08-22 04:32:13 -07:00
+										pairs    []model.LabelPair
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										expected model.Fingerprints
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+									}{
 										{
-												storage: Make tests go-vet and golint clean

											
										
										
											2016-12-13 07:57:49 -08:00
+											pairs:    []model.LabelPair{{Name: "label1", Value: "x"}},
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											expected: fingerprints[:0],
 										},
 										{
-												storage: Make tests go-vet and golint clean

											
										
										
											2016-12-13 07:57:49 -08:00
+											pairs:    []model.LabelPair{{Name: "label1", Value: "test_0"}},
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											expected: fingerprints[:10],
 										},
 										{
-												Replace metric.LabelPair with model.LabelPair

											
										
										
											2015-08-22 04:32:13 -07:00
+											pairs: []model.LabelPair{
-												storage: Make tests go-vet and golint clean

											
										
										
											2016-12-13 07:57:49 -08:00
+												{Name: "label1", Value: "test_0"},
 												{Name: "label1", Value: "test_1"},
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											},
 											expected: fingerprints[:0],
 										},
 										{
-												Replace metric.LabelPair with model.LabelPair

											
										
										
											2015-08-22 04:32:13 -07:00
+											pairs: []model.LabelPair{
-												storage: Make tests go-vet and golint clean

											
										
										
											2016-12-13 07:57:49 -08:00
+												{Name: "label1", Value: "test_0"},
 												{Name: "label2", Value: "test_1"},
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											},
 											expected: fingerprints[5:10],
 										},
 										{
-												Replace metric.LabelPair with model.LabelPair

											
										
										
											2015-08-22 04:32:13 -07:00
+											pairs: []model.LabelPair{
-												storage: Make tests go-vet and golint clean

											
										
										
											2016-12-13 07:57:49 -08:00
+												{Name: "label1", Value: "test_1"},
 												{Name: "label2", Value: "test_2"},
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+											},
 											expected: fingerprints[15:20],
 										},
 									}
 									for _, mt := range matcherTests {
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+										var resfps map[model.Fingerprint]struct{}
 										for _, pair := range mt.pairs {
 											resfps = storage.fingerprintsForLabelPair(pair, nil, resfps)
 										}
-												storage: improve label matching and allow unset matching.

Matching of empty labels now also matches metrics where the label
was not explicitly set to the empty string.

											
										
										
											2015-06-15 09:25:31 -07:00
+										if len(mt.expected) != len(resfps) {
 											t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.pairs, len(resfps))
 										}
 										for fp1 := range resfps {
 											found := false
 											for _, fp2 := range mt.expected {
 												if fp1 == fp2 {
 													found = true
 													break
 												}
 											}
 											if !found {
 												t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.pairs)
 											}
 										}
 									}
 								}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+								var benchLabelMatchingRes []metric.Metric
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
 								func BenchmarkLabelMatching(b *testing.B) {
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									s, closer := NewTestStorage(b, 2)
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+									defer closer.Close()
 									h := fnv.New64a()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									lbl := func(x int) model.LabelValue {
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+										h.Reset()
 										h.Write([]byte(fmt.Sprintf("%d", x)))
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										return model.LabelValue(fmt.Sprintf("%d", h.Sum64()))
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+									}
 									M := 32
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									met := model.Metric{}
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+									for i := 0; i < M; i++ {
 										met["label_a"] = lbl(i)
 										for j := 0; j < M; j++ {
 											met["label_b"] = lbl(j)
 											for k := 0; k < M; k++ {
 												met["label_c"] = lbl(k)
 												for l := 0; l < M; l++ {
 													met["label_d"] = lbl(l)
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													s.Append(&model.Sample{
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+														Metric:    met.Clone(),
 														Timestamp: 0,
 														Value:     1,
 													})
 												}
 											}
 										}
 									}
 									s.WaitForIndexing()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+										lm, err := metric.NewLabelMatcher(matchType, name, value)
 										if err != nil {
 											b.Fatalf("error creating label matcher: %s", err)
 										}
 										return lm
 									}
 									var matcherTests = []metric.LabelMatchers{
 										{
 											newMatcher(metric.Equal, "label_a", lbl(1)),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(3)),
 											newMatcher(metric.Equal, "label_c", lbl(3)),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(3)),
 											newMatcher(metric.Equal, "label_c", lbl(3)),
 											newMatcher(metric.NotEqual, "label_d", lbl(3)),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(3)),
 											newMatcher(metric.Equal, "label_b", lbl(3)),
 											newMatcher(metric.Equal, "label_c", lbl(3)),
 											newMatcher(metric.NotEqual, "label_d", lbl(3)),
 										},
 										{
 											newMatcher(metric.RegexMatch, "label_a", ".+"),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(3)),
 											newMatcher(metric.RegexMatch, "label_a", ".+"),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(1)),
 											newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
 										},
 										{
 											newMatcher(metric.Equal, "label_a", lbl(3)),
 											newMatcher(metric.Equal, "label_a", lbl(4)),
 											newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
 										},
 									}
 									b.ReportAllocs()
 									b.ResetTimer()
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									var err error
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+									for i := 0; i < b.N; i++ {
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										benchLabelMatchingRes = []metric.Metric{}
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+										for _, mt := range matcherTests {
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											benchLabelMatchingRes, err = s.MetricsForLabelMatchers(
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+												context.Background(),
-												Improve MetricsForLabelMatchers

WIP: This needs more tests.

It now gets a from and through value, which it may opportunistically
use to optimize the retrieval. With possible future range indices,
this could be used in a very efficient way. This change merely applies
some easy checks, which should nevertheless solve the use case of
heavy rule evaluations on servers with a lot of series churn.

Idea is the following:

- Only archive series that are at least as old as the headChunkTimeout
  (which was already extremely unlikely to happen).

- Then maintain a high watermark for the last archival, i.e. no
  archived series has a sample more recent than that watermark.

- Any query that doesn't reach to a time before that watermark doesn't
  have to touch the archive index at all. (A production server at
  Soundcloud with the aforementioned series churn and heavy rule
  evaluations spends 50% of its CPU time in archive index
  lookups. Since rule evaluations usually only touch very recent
  values, most of those lookup should disappear with this change.)

- Federation with a very broad label matcher will profit from this,
  too.

As a byproduct, the un-needed MetricForFingerprint method was removed
from the Storage interface.

											
										
										
											2016-03-08 15:09:42 -08:00
+												model.Earliest, model.Latest,
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+												mt,
-												Improve MetricsForLabelMatchers

WIP: This needs more tests.

It now gets a from and through value, which it may opportunistically
use to optimize the retrieval. With possible future range indices,
this could be used in a very efficient way. This change merely applies
some easy checks, which should nevertheless solve the use case of
heavy rule evaluations on servers with a lot of series churn.

Idea is the following:

- Only archive series that are at least as old as the headChunkTimeout
  (which was already extremely unlikely to happen).

- Then maintain a high watermark for the last archival, i.e. no
  archived series has a sample more recent than that watermark.

- Any query that doesn't reach to a time before that watermark doesn't
  have to touch the archive index at all. (A production server at
  Soundcloud with the aforementioned series churn and heavy rule
  evaluations spends 50% of its CPU time in archive index
  lookups. Since rule evaluations usually only touch very recent
  values, most of those lookup should disappear with this change.)

- Federation with a very broad label matcher will profit from this,
  too.

As a byproduct, the un-needed MetricForFingerprint method was removed
from the Storage interface.

											
										
										
											2016-03-08 15:09:42 -08:00
+											)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											if err != nil {
 												b.Fatal(err)
 											}
-												storage/local: add benchmark for label matching.

											
										
										
											2015-06-22 03:02:03 -07:00
+										}
 									}
 									// Stop timer to not count the storage closing.
 									b.StopTimer()
 								}
-												Added BenchmarkQueryRange

											
										
										
											2016-09-18 14:58:39 -07:00
+								func BenchmarkQueryRange(b *testing.B) {
 									now := model.Now()
 									insertStart := now.Add(-2 * time.Hour)
 									s, closer := NewTestStorage(b, 2)
 									defer closer.Close()
 									// Stop maintenance loop to prevent actual purging.
 									close(s.loopStopping)
 									<-s.loopStopped
 									<-s.logThrottlingStopped
 									// Recreate channel to avoid panic when we really shut down.
 									s.loopStopping = make(chan struct{})
 									for i := 0; i < 8192; i++ {
 										s.Append(&model.Sample{
 											Metric:    model.Metric{"__name__": model.LabelValue(strconv.Itoa(i)), "job": "test"},
 											Timestamp: insertStart,
 											Value:     1,
 										})
 									}
 									s.WaitForIndexing()
 									b.ResetTimer()
 									b.RunParallel(func(pb *testing.PB) {
 										lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
 										for pb.Next() {
 											s.QueryRange(context.Background(), insertStart, now, lm)
 										}
 									})
 								}
-												Protect exported Querier interface method against negative time ranges

											
										
										
											2016-11-01 07:05:01 -07:00
+								func TestQueryRangeThroughBeforeFrom(t *testing.T) {
 									now := model.Now()
 									insertStart := now.Add(-2 * time.Hour)
 									s, closer := NewTestStorage(t, 2)
 									defer closer.Close()
 									// Stop maintenance loop to prevent actual purging.
 									close(s.loopStopping)
 									<-s.loopStopped
 									<-s.logThrottlingStopped
 									// Recreate channel to avoid panic when we really shut down.
 									s.loopStopping = make(chan struct{})
 									for i := 0; i < 8192; i++ {
 										s.Append(&model.Sample{
 											Metric:    model.Metric{"__name__": "testmetric", "job": "test"},
 											Timestamp: insertStart.Add(time.Duration(i) * time.Second),
 											Value:     model.SampleValue(rand.Float64()),
 										})
 									}
 									s.WaitForIndexing()
 									lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
 									iters, err := s.QueryRange(context.Background(), now.Add(-30*time.Minute), now.Add(-90*time.Minute), lm)
 									if err != nil {
 										t.Error(err)
 									}
 									if len(iters) != 0 {
 										t.Errorf("expected no iters to be returned, got %d", len(iters))
 									}
 								}
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+								func TestRetentionCutoff(t *testing.T) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									now := model.Now()
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+									insertStart := now.Add(-2 * time.Hour)
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									s, closer := NewTestStorage(t, 2)
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+									defer closer.Close()
 									// Stop maintenance loop to prevent actual purging.
-												Remove a race condition from TestRetentionCutoff

											
										
										
											2016-01-25 07:36:14 -08:00
+									close(s.loopStopping)
 									<-s.loopStopped
-												Remove race condition from TestRetentionCutoff

											
										
										
											2016-02-11 16:46:18 -08:00
+									<-s.logThrottlingStopped
-												Remove a race condition from TestRetentionCutoff

											
										
										
											2016-01-25 07:36:14 -08:00
+									// Recreate channel to avoid panic when we really shut down.
 									s.loopStopping = make(chan struct{})
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
 									s.dropAfter = 1 * time.Hour
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									for i := 0; i < 120; i++ {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										smpl := &model.Sample{
 											Metric:    model.Metric{"job": "test"},
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+											Timestamp: insertStart.Add(time.Duration(i) * time.Minute), // 1 minute intervals.
 											Value:     1,
 										}
 										s.Append(smpl)
 									}
 									s.WaitForIndexing()
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									lm, err := metric.NewLabelMatcher(metric.Equal, "job", "test")
 									if err != nil {
 										t.Fatalf("error creating label matcher: %s", err)
 									}
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+									its, err := s.QueryRange(context.Background(), insertStart, now, lm)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									if err != nil {
 										t.Fatal(err)
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+									}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									if len(its) != 1 {
 										t.Fatalf("expected one iterator but got %d", len(its))
 									}
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									val := its[0].ValueAtOrBeforeTime(now.Add(-61 * time.Minute))
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									if val.Timestamp != model.Earliest {
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+										t.Errorf("unexpected result for timestamp before retention period")
 									}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									vals := its[0].RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now})
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									// We get 59 values here because the model.Now() is slightly later
-												Limit retrievable samples to retention window.

The storage does not delete data immediately after the retention period.
We don't want to retrieve this data as it causes artifacts.

											
										
										
											2015-05-27 02:24:56 -07:00
+									// than our now.
 									if len(vals) != 59 {
 										t.Errorf("expected 59 values but got %d", len(vals))
 									}
 									if expt := now.Add(-1 * time.Hour).Add(time.Minute); vals[0].Timestamp != expt {
 										t.Errorf("unexpected timestamp for first sample: %v, expected %v", vals[0].Timestamp.Time(), expt.Time())
 									}
 								}
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+								func TestDropMetrics(t *testing.T) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									now := model.Now()
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									insertStart := now.Add(-2 * time.Hour)
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									s, closer := NewTestStorage(t, 2)
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									defer closer.Close()
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									chunkFileExists := func(fp model.Fingerprint) (bool, error) {
 										f, err := s.persistence.openChunkFileForReading(fp)
 										if err == nil {
 											f.Close()
 											return true, nil
 										}
 										if os.IsNotExist(err) {
 											return false, nil
 										}
 										return false, err
 									}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
 									m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									lm1, err := metric.NewLabelMatcher(metric.Equal, "n1", "v1")
 									if err != nil {
 										t.Fatal(err)
 									}
 									lmAll, err := metric.NewLabelMatcher(metric.Equal, model.MetricNameLabel, "test")
 									if err != nil {
 										t.Fatal(err)
 									}
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									N := 120000
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									for j, m := range []model.Metric{m1, m2, m3} {
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+										for i := 0; i < N; i++ {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											smpl := &model.Sample{
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+												Metric:    m,
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+												Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												Value:     model.SampleValue(j),
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+											}
 											s.Append(smpl)
 										}
 									}
 									s.WaitForIndexing()
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									// Archive m3, but first maintain it so that at least something is written to disk.
 									fpToBeArchived := m3.FastFingerprint()
 									s.maintainMemorySeries(fpToBeArchived, 0)
 									s.fpLocker.Lock(fpToBeArchived)
 									s.fpToSeries.del(fpToBeArchived)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									s.fpLocker.Unlock(fpToBeArchived)
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									fps := s.fingerprintsForLabelPair(model.LabelPair{
 										Name: model.MetricNameLabel, Value: "test",
 									}, nil, nil)
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									if len(fps) != 3 {
 										t.Errorf("unexpected number of fingerprints: %d", len(fps))
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									fpList := model.Fingerprints{m1.FastFingerprint(), m2.FastFingerprint(), fpToBeArchived}
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+									n, err := s.DropMetricsForLabelMatchers(context.Background(), lm1)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									if err != nil {
 										t.Fatal(err)
 									}
 									if n != 1 {
 										t.Fatalf("expected 1 series to be dropped, got %d", n)
 									}
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									s.WaitForIndexing()
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									fps2 := s.fingerprintsForLabelPair(model.LabelPair{
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										Name: model.MetricNameLabel, Value: "test",
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									}, nil, nil)
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+									if len(fps2) != 2 {
 										t.Errorf("unexpected number of fingerprints: %d", len(fps2))
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("unexpected number of samples: %d", len(vals))
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[1]), model.Earliest, model.Latest)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != N {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("unexpected number of samples: %d", len(vals))
 									}
 									exists, err := chunkFileExists(fpList[2])
 									if err != nil {
 										t.Fatal(err)
 									}
 									if !exists {
 										t.Errorf("chunk file does not exist for fp=%v", fpList[2])
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												storage: Contextify storage interfaces.

This is based on https://github.com/prometheus/prometheus/pull/1997.

This adds contexts to the relevant Storage methods and already passes
PromQL's new per-query context into the storage's query methods.
The immediate motivation supporting multi-tenancy in Frankenstein, but
this could also be used by Prometheus's normal local storage to support
cancellations and timeouts at some point.

											
										
										
											2016-09-15 15:58:06 -07:00
+									n, err = s.DropMetricsForLabelMatchers(context.Background(), lmAll)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									if err != nil {
 										t.Fatal(err)
 									}
 									if n != 2 {
 										t.Fatalf("expected 2 series to be dropped, got %d", n)
 									}
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									s.WaitForIndexing()
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									fps3 := s.fingerprintsForLabelPair(model.LabelPair{
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										Name: model.MetricNameLabel, Value: "test",
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									}, nil, nil)
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									if len(fps3) != 0 {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("unexpected number of fingerprints: %d", len(fps3))
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("unexpected number of samples: %d", len(vals))
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[1]), model.Earliest, model.Latest)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("unexpected number of samples: %d", len(vals))
 									}
 									exists, err = chunkFileExists(fpList[2])
 									if err != nil {
 										t.Fatal(err)
 									}
 									if exists {
 										t.Errorf("chunk file still exists for fp=%v", fpList[2])
-												Add storage method to delete time series

											
										
										
											2015-05-27 08:41:57 -07:00
+									}
 								}
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+								func TestQuarantineMetric(t *testing.T) {
 									now := model.Now()
 									insertStart := now.Add(-2 * time.Hour)
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									s, closer := NewTestStorage(t, 2)
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									defer closer.Close()
 									chunkFileExists := func(fp model.Fingerprint) (bool, error) {
 										f, err := s.persistence.openChunkFileForReading(fp)
 										if err == nil {
 											f.Close()
 											return true, nil
 										}
 										if os.IsNotExist(err) {
 											return false, nil
 										}
 										return false, err
 									}
 									m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
 									m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
 									m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}
 									N := 120000
 									for j, m := range []model.Metric{m1, m2, m3} {
 										for i := 0; i < N; i++ {
 											smpl := &model.Sample{
 												Metric:    m,
 												Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
 												Value:     model.SampleValue(j),
 											}
 											s.Append(smpl)
 										}
 									}
 									s.WaitForIndexing()
 									// Archive m3, but first maintain it so that at least something is written to disk.
 									fpToBeArchived := m3.FastFingerprint()
 									s.maintainMemorySeries(fpToBeArchived, 0)
 									s.fpLocker.Lock(fpToBeArchived)
 									s.fpToSeries.del(fpToBeArchived)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									s.fpLocker.Unlock(fpToBeArchived)
 									// Corrupt the series file for m3.
 									f, err := os.Create(s.persistence.fileNameForFingerprint(fpToBeArchived))
 									if err != nil {
 										t.Fatal(err)
 									}
 									if _, err := f.WriteString("This is clearly not the content of a series file."); err != nil {
 										t.Fatal(err)
 									}
 									if f.Close(); err != nil {
 										t.Fatal(err)
 									}
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									fps := s.fingerprintsForLabelPair(model.LabelPair{
 										Name: model.MetricNameLabel, Value: "test",
 									}, nil, nil)
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									if len(fps) != 3 {
 										t.Errorf("unexpected number of fingerprints: %d", len(fps))
 									}
 									// This will access the corrupt file and lead to quarantining.
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									iter := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fpToBeArchived), now.Add(-2*time.Hour-1*time.Minute), now.Add(-2*time.Hour))
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									iter.Close()
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									time.Sleep(time.Second) // Give time to quarantine. TODO(beorn7): Find a better way to wait.
 									s.WaitForIndexing()
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									fps2 := s.fingerprintsForLabelPair(model.LabelPair{
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+										Name: model.MetricNameLabel, Value: "test",
-												storage: improve index lookups

tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is) but it at least avoids utilizing the intersection
problem to the greatest possible amount.

In more detail:

Imagine the following query:

    nicely:aggregating:rule{job="foo",env="prod"}

While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:

    nicely:aggregating:rule{job=~"foo",env=~"prod"}

Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then starts right ahead to
retrieve the metric for those FPs and checking individually if they
match the other matchers.

This change is generalizing the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:

- First, sort all matchers by "expected cardinality". Matchers
  matching the empty string are always worst (and never used for
  intersections). Equal matchers are in general consider best, but by
  using some crude heuristics, we declare some better than others
  (instance labels or anything that looks like a recording rule).

- Then go through the matchers until we hit a threshold of remaining
  FPs in the intersection. This threshold is higher if we are already
  in the non-Equal matcher area as intersection is even more expensive
  here.

- Once the threshold has been reached (or we have run out of matchers
  that do not match the empty string), start with "retrieve metrics
  and check them individually against remaining matchers".

A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).

											
										
										
											2016-06-28 11:18:32 -07:00
+									}, nil, nil)
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									if len(fps2) != 2 {
 										t.Errorf("unexpected number of fingerprints: %d", len(fps2))
 									}
 									exists, err := chunkFileExists(fpToBeArchived)
 									if err != nil {
 										t.Fatal(err)
 									}
 									if exists {
 										t.Errorf("chunk file exists for fp=%v", fpToBeArchived)
 									}
 								}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								// TestLoop is just a smoke test for the loop method, if we can switch it on and
 								// off without disaster.
 								func TestLoop(t *testing.T) {
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									if testing.Short() {
 										t.Skip("Skipping test in short mode.")
 									}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 1000)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										}
 									}
-												Move test package to pkg/testutil

											
										
										
											2015-05-28 11:58:38 -07:00
+									directory := testutil.NewTemporaryDirectory("test_storage", t)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									defer directory.Close()
 									o := &MemorySeriesStorageOptions{
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+										TargetHeapSize:             100000,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										PersistenceRetentionPeriod: 24 * 7 * time.Hour,
 										PersistenceStoragePath:     directory.Path(),
-												storage: Use staleness delta as head chunk timeout

Currently, if a series stops to exist, its head chunk will be kept
open for an hour. That prevents it from being persisted. Which
prevents it from being evicted. Which prevents the series from being
archived.

Most of the time, once no sample has been added to a series within the
staleness limit, we can be pretty confident that this series will not
receive samples anymore. The whole chain as described above can be
started after 5m instead of 1h. In the relaxed case, this doesn't
change a lot as the head chunk timeout is only checked during series
maintenance, and usually, a series is only maintained every six
hours. However, there is the typical scenario where a large service is
deployed, the deoply turns out to be bad, and then it is deployed
again within minutes, and quite quickly the number of time series has
tripled. That's the point where the Prometheus server is stressed and
switches (rightfully) into rushed mode. In that mode, time series are
processed as quickly as possible, but all of that is in vein if all of
those recently ended time series cannot be persisted yet for another
hour. In that scenario, this change will help most, and it's exactly
the scenario where help is most desperately needed.

											
										
										
											2017-03-26 14:44:50 -07:00
+										HeadChunkTimeout:           5 * time.Minute,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										CheckpointInterval:         250 * time.Millisecond,
-												Increase resilience of the storage against data corruption - step 4.

Step 4: Add a configurable sync'ing of series files after modification.

											
										
										
											2015-03-19 07:41:50 -07:00
+										SyncStrategy:               Adaptive,
-												Improve handling of series file truncation

If only very few chunks are to be truncated from a very large series
file, the rewrite of the file is a lorge overhead. With this change, a
certain ratio of the file has to be dropped to make it happen. While
only causing disk overhead at about the same ratio (by default 10%),
it will cut down I/O by a lot in above scenario.

											
										
										
											2016-01-11 07:42:10 -08:00
+										MinShrinkRatio:             0.1,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
-												Do not start storage processing before Start() is called.

											
										
										
											2015-05-18 10:26:28 -07:00
+									storage := NewMemorySeriesStorage(o)
-												Fix storage test

											
										
										
											2015-05-20 07:12:07 -07:00
+									if err := storage.Start(); err != nil {
-												Fix DropMetricsForFingerprints

It now deletes the series file also for archived series.

Also, fix a naming error in a doc comment.

											
										
										
											2015-09-11 06:47:23 -07:00
+										t.Errorf("Error starting storage: %s", err)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, s := range samples {
 										storage.Append(s)
 									}
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									storage.WaitForIndexing()
-												storage: Remove race condition from TestLoop

											
										
										
											2017-02-01 10:41:15 -08:00
+									fp := model.Metric{}.FastFingerprint()
 									series, _ := storage.fpToSeries.get(fp)
 									storage.fpLocker.Lock(fp)
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									cdsBefore := len(series.chunkDescs)
-												storage: Remove race condition from TestLoop

											
										
										
											2017-02-01 10:41:15 -08:00
+									storage.fpLocker.Unlock(fp)
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									time.Sleep(fpMaxWaitDuration + time.Second) // TODO(beorn7): Ugh, need to wait for maintenance to kick in.
-												storage: Remove race condition from TestLoop

											
										
										
											2017-02-01 10:41:15 -08:00
+									storage.fpLocker.Lock(fp)
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									cdsAfter := len(series.chunkDescs)
-												storage: Remove race condition from TestLoop

											
										
										
											2017-02-01 10:41:15 -08:00
+									storage.fpLocker.Unlock(fp)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									storage.Stop()
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									if cdsBefore <= cdsAfter {
 										t.Errorf(
 											"Number of chunk descriptors should have gone down by now. Got before %d, after %d.",
 											cdsBefore, cdsAfter,
 										)
 									}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testChunk(t *testing.T, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 500000)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+									s, closer := NewTestStorage(t, encoding)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									defer closer.Close()
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Improve performance of ingestion.

- Parallelize AppendSamples as much as possible without breaking the
  contract about temporal order.

- Allocate more fingerprint locker slots.

- Do not run early checkpoints if we are behind on chunk persistence.

- Increase fpMinWaitDuration to give the disk more time for more
  important things.

Also, switch math.MaxInt64 and math.MinInt64 to the new constants.

											
										
										
											2015-02-12 08:23:42 -08:00
+									s.WaitForIndexing()
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									for m := range s.fpToSeries.iter() {
 										s.fpLocker.Lock(m.fp)
-												Replace metric.SamplePair with model.SamplePair

											
										
										
											2015-08-22 05:52:35 -07:00
+										var values []model.SamplePair
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+										for _, cd := range m.series.chunkDescs {
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+											if cd.IsEvicted() {
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+												continue
 											}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+											it := cd.C.NewIterator()
-												timeseries: store varbit encoded data into cassandra

											
										
										
											2016-09-21 08:56:55 -07:00
+											for it.Scan() {
 												values = append(values, it.Value())
-												Slim down the chunkIterator interface

For one, remove unneeded methods.

Then, instead of using a channel for all values, use a
bufio.Scanner-like interface. This removes the need for creating a
goroutine and avoids the (unnecessary) locking performed by channel
sending and receiving.

This will make it much easier to write new chunk implementations (like
Gorilla-style encoding).

											
										
										
											2016-03-07 10:50:13 -08:00
+											}
-												timeseries: store varbit encoded data into cassandra

											
										
										
											2016-09-21 08:56:55 -07:00
+											if it.Err() != nil {
 												t.Error(it.Err())
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+											}
 										}
 										for i, v := range values {
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+											if samples[i].Timestamp != v.Timestamp {
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+												t.Errorf("%d. Got %v; want %v", i, v.Timestamp, samples[i].Timestamp)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+											}
-												Make floats exact again.

This should do the right thing for the old delta chunks, too.

											
										
										
											2015-03-06 07:03:03 -08:00
+											if samples[i].Value != v.Value {
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+												t.Errorf("%d. Got %v; want %v", i, v.Value, samples[i].Value)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+											}
 										}
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+										s.fpLocker.Unlock(m.fp)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									}
-												Switch Prometheus to use github.com/prometheus/log.

This change is conceptually very simple, although the diff is large. It
switches logging from "github.com/golang/glog" to
"github.com/prometheus/log", while not actually changing any log
messages. V(1)-style logging has been changed to be log.Debug*().

											
										
										
											2015-05-20 09:10:29 -07:00
+									log.Info("test done, closing")
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+								}
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								func TestChunkType0(t *testing.T) {
 									testChunk(t, 0)
 								}
 								func TestChunkType1(t *testing.T) {
 									testChunk(t, 1)
 								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func TestChunkType2(t *testing.T) {
 									testChunk(t, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testValueAtOrBeforeTime(t *testing.T, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+									s, closer := NewTestStorage(t, encoding)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									defer closer.Close()
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Improve performance of ingestion.

- Parallelize AppendSamples as much as possible without breaking the
  contract about temporal order.

- Allocate more fingerprint locker slots.

- Do not run early checkpoints if we are behind on chunk persistence.

- Increase fpMinWaitDuration to give the disk more time for more
  important things.

Also, switch math.MaxInt64 and math.MinInt64 to the new constants.

											
										
										
											2015-02-12 08:23:42 -08:00
+									s.WaitForIndexing()
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									// #1 Exactly on a sample.
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i, expected := range samples {
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										actual := it.ValueAtOrBeforeTime(expected.Timestamp)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										if expected.Timestamp != actual.Timestamp {
 											t.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										if expected.Value != actual.Value {
 											t.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
 									// #2 Between samples.
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									for i, expected := range samples {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										if i == len(samples)-1 {
 											continue
 										}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										if expected.Timestamp != actual.Timestamp {
 											t.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										if expected.Value != actual.Value {
 											t.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										}
 									}
 									// #3 Corner cases: Just before the first sample, just after the last.
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									expected := &model.Sample{Timestamp: model.Earliest}
 									actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
 									if expected.Timestamp != actual.Timestamp {
 										t.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									if expected.Value != actual.Value {
 										t.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									}
 									expected = samples[len(samples)-1]
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
 									if expected.Timestamp != actual.Timestamp {
 										t.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									if expected.Value != actual.Value {
 										t.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									}
-												Add test to expose #2965

											
										
										
											2017-07-21 07:25:24 -07:00
 									// #4 Query alternatingly exactly on and just between timestamps.
 									// Exposes issue #2965.
 									for i, expected := range samples {
 										i *= 2
 										actual := it.ValueAtOrBeforeTime(expected.Timestamp)
 										if expected.Timestamp != actual.Timestamp {
 											t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
 										}
 										if expected.Value != actual.Value {
 											t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
 										}
 										i++
 										actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
 										if expected.Timestamp != actual.Timestamp {
 											t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
 										}
 										if expected.Value != actual.Value {
 											t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
 										}
 									}
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func TestValueAtTimeChunkType0(t *testing.T) {
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									testValueAtOrBeforeTime(t, 0)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func TestValueAtTimeChunkType1(t *testing.T) {
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									testValueAtOrBeforeTime(t, 1)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func TestValueAtTimeChunkType2(t *testing.T) {
 									testValueAtOrBeforeTime(t, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func benchmarkValueAtOrBeforeTime(b *testing.B, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+										}
 									}
 									s, closer := NewTestStorage(b, encoding)
 									defer closer.Close()
 									for _, sample := range samples {
 										s.Append(sample)
 									}
 									s.WaitForIndexing()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									b.ResetTimer()
 									for i := 0; i < b.N; i++ {
 										// #1 Exactly on a sample.
 										for i, expected := range samples {
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											actual := it.ValueAtOrBeforeTime(expected.Timestamp)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											if expected.Timestamp != actual.Timestamp {
 												b.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+											}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											if expected.Value != actual.Value {
 												b.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+											}
 										}
 										// #2 Between samples.
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										for i, expected := range samples {
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+											if i == len(samples)-1 {
 												continue
 											}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											if expected.Timestamp != actual.Timestamp {
 												b.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+											}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+											if expected.Value != actual.Value {
 												b.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+											}
 										}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
 										// #3 Corner cases: Just before the first sample, just after the last.
 										expected := &model.Sample{Timestamp: model.Earliest}
 										actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
 										if expected.Timestamp != actual.Timestamp {
 											b.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
 										}
 										if expected.Value != actual.Value {
 											b.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
 										}
 										expected = samples[len(samples)-1]
 										actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
 										if expected.Timestamp != actual.Timestamp {
 											b.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
 										}
 										if expected.Value != actual.Value {
 											b.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
 										}
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									}
 								}
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+								func BenchmarkValueAtOrBeforeTimeChunkType0(b *testing.B) {
 									benchmarkValueAtOrBeforeTime(b, 0)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func BenchmarkValueAtTimeChunkType1(b *testing.B) {
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+									benchmarkValueAtOrBeforeTime(b, 1)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func BenchmarkValueAtTimeChunkType2(b *testing.B) {
 									benchmarkValueAtOrBeforeTime(b, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testRangeValues(t *testing.T, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+									s, closer := NewTestStorage(t, encoding)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									defer closer.Close()
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Improve performance of ingestion.

- Parallelize AppendSamples as much as possible without breaking the
  contract about temporal order.

- Allocate more fingerprint locker slots.

- Do not run early checkpoints if we are behind on chunk persistence.

- Increase fpMinWaitDuration to give the disk more time for more
  important things.

Also, switch math.MaxInt64 and math.MinInt64 to the new constants.

											
										
										
											2015-02-12 08:23:42 -08:00
+									s.WaitForIndexing()
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									// #1 Zero length interval at sample.
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i, expected := range samples {
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+										actual := it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											OldestInclusive: expected.Timestamp,
 											NewestInclusive: expected.Timestamp,
 										})
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										if len(actual) != 1 {
 											t.Fatalf("1.%d. Expected exactly one result, got %d.", i, len(actual))
 										}
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										if expected.Timestamp != actual[0].Timestamp {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											t.Errorf("1.%d. Got %v; want %v.", i, actual[0].Timestamp, expected.Timestamp)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 										if expected.Value != actual[0].Value {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											t.Errorf("1.%d. Got %v; want %v.", i, actual[0].Value, expected.Value)
 										}
 									}
 									// #2 Zero length interval off sample.
 									for i, expected := range samples {
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+										actual := it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											OldestInclusive: expected.Timestamp + 1,
 											NewestInclusive: expected.Timestamp + 1,
 										})
 										if len(actual) != 0 {
 											t.Fatalf("2.%d. Expected no result, got %d.", i, len(actual))
 										}
 									}
 									// #3 2sec interval around sample.
 									for i, expected := range samples {
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+										actual := it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											OldestInclusive: expected.Timestamp - 1,
 											NewestInclusive: expected.Timestamp + 1,
 										})
 										if len(actual) != 1 {
 											t.Fatalf("3.%d. Expected exactly one result, got %d.", i, len(actual))
 										}
 										if expected.Timestamp != actual[0].Timestamp {
 											t.Errorf("3.%d. Got %v; want %v.", i, actual[0].Timestamp, expected.Timestamp)
 										}
 										if expected.Value != actual[0].Value {
 											t.Errorf("3.%d. Got %v; want %v.", i, actual[0].Value, expected.Value)
 										}
 									}
 									// #4 2sec interval sample to sample.
 									for i, expected1 := range samples {
 										if i == len(samples)-1 {
 											continue
 										}
 										expected2 := samples[i+1]
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+										actual := it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											OldestInclusive: expected1.Timestamp,
 											NewestInclusive: expected1.Timestamp + 2,
 										})
 										if len(actual) != 2 {
 											t.Fatalf("4.%d. Expected exactly 2 results, got %d.", i, len(actual))
 										}
 										if expected1.Timestamp != actual[0].Timestamp {
 											t.Errorf("4.%d. Got %v for 1st result; want %v.", i, actual[0].Timestamp, expected1.Timestamp)
 										}
 										if expected1.Value != actual[0].Value {
 											t.Errorf("4.%d. Got %v for 1st result; want %v.", i, actual[0].Value, expected1.Value)
 										}
 										if expected2.Timestamp != actual[1].Timestamp {
 											t.Errorf("4.%d. Got %v for 2nd result; want %v.", i, actual[1].Timestamp, expected2.Timestamp)
 										}
 										if expected2.Value != actual[1].Value {
 											t.Errorf("4.%d. Got %v for 2nd result; want %v.", i, actual[1].Value, expected2.Value)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
 									// #5 corner cases: Interval ends at first sample, interval starts
 									// at last sample, interval entirely before/after samples.
 									expected := samples[0]
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+									actual := it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										OldestInclusive: expected.Timestamp - 2,
 										NewestInclusive: expected.Timestamp,
 									})
 									if len(actual) != 1 {
 										t.Fatalf("5.1. Expected exactly one result, got %d.", len(actual))
 									}
 									if expected.Timestamp != actual[0].Timestamp {
 										t.Errorf("5.1. Got %v; want %v.", actual[0].Timestamp, expected.Timestamp)
 									}
 									if expected.Value != actual[0].Value {
 										t.Errorf("5.1. Got %v; want %v.", actual[0].Value, expected.Value)
 									}
 									expected = samples[len(samples)-1]
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+									actual = it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										OldestInclusive: expected.Timestamp,
 										NewestInclusive: expected.Timestamp + 2,
 									})
 									if len(actual) != 1 {
 										t.Fatalf("5.2. Expected exactly one result, got %d.", len(actual))
 									}
 									if expected.Timestamp != actual[0].Timestamp {
 										t.Errorf("5.2. Got %v; want %v.", actual[0].Timestamp, expected.Timestamp)
 									}
 									if expected.Value != actual[0].Value {
 										t.Errorf("5.2. Got %v; want %v.", actual[0].Value, expected.Value)
 									}
 									firstSample := samples[0]
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+									actual = it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										OldestInclusive: firstSample.Timestamp - 4,
 										NewestInclusive: firstSample.Timestamp - 2,
 									})
 									if len(actual) != 0 {
 										t.Fatalf("5.3. Expected no results, got %d.", len(actual))
 									}
 									lastSample := samples[len(samples)-1]
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+									actual = it.RangeValues(metric.Interval{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										OldestInclusive: lastSample.Timestamp + 2,
 										NewestInclusive: lastSample.Timestamp + 4,
 									})
 									if len(actual) != 0 {
 										t.Fatalf("5.3. Expected no results, got %d.", len(actual))
 									}
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func TestRangeValuesChunkType0(t *testing.T) {
 									testRangeValues(t, 0)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func TestRangeValuesChunkType1(t *testing.T) {
 									testRangeValues(t, 1)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func TestRangeValuesChunkType2(t *testing.T) {
 									testRangeValues(t, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func benchmarkRangeValues(b *testing.B, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i) * 0.2),
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+										}
 									}
 									s, closer := NewTestStorage(b, encoding)
 									defer closer.Close()
 									for _, sample := range samples {
 										s.Append(sample)
 									}
 									s.WaitForIndexing()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									b.ResetTimer()
 									for i := 0; i < b.N; i++ {
 										for _, sample := range samples {
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+											actual := it.RangeValues(metric.Interval{
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+												OldestInclusive: sample.Timestamp - 20,
 												NewestInclusive: sample.Timestamp + 20,
 											})
 											if len(actual) < 10 {
 												b.Fatalf("not enough samples found")
 											}
 										}
 									}
 								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func BenchmarkRangeValuesChunkType0(b *testing.B) {
 									benchmarkRangeValues(b, 0)
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+								}
-												Weed out all the [Gg]et* method names.

The only exception is getNumChunksToPersist to avoid naming the struct
member numChunksToPersist in a weird way.

											
										
										
											2015-05-20 10:13:06 -07:00
+								func BenchmarkRangeValuesChunkType1(b *testing.B) {
 									benchmarkRangeValues(b, 1)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func BenchmarkRangeValuesChunkType2(b *testing.B) {
 									benchmarkRangeValues(b, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testEvictAndPurgeSeries(t *testing.T, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i * i)),
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										}
 									}
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+									s, closer := NewTestStorage(t, encoding)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									defer closer.Close()
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Improve performance of ingestion.

- Parallelize AppendSamples as much as possible without breaking the
  contract about temporal order.

- Allocate more fingerprint locker slots.

- Do not run early checkpoints if we are behind on chunk persistence.

- Increase fpMinWaitDuration to give the disk more time for more
  important things.

Also, switch math.MaxInt64 and math.MinInt64 to the new constants.

											
										
										
											2015-02-12 08:23:42 -08:00
+									s.WaitForIndexing()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									// Drop ~half of the chunks.
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									s.maintainMemorySeries(fp, 10000)
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Improve predict_linear

Fixes https://github.com/prometheus/prometheus/issues/1401

This remove the last (and in fact bogus) use of BoundaryValues.

Thus, a whole lot of unused (and arguably sub-optimal / ugly) code can
be removed here, too.

											
										
										
											2016-02-24 08:16:24 -08:00
+									actual := it.RangeValues(metric.Interval{
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										OldestInclusive: 0,
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+										NewestInclusive: 100000,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									})
-												Improve predict_linear

Fixes https://github.com/prometheus/prometheus/issues/1401

This remove the last (and in fact bogus) use of BoundaryValues.

Thus, a whole lot of unused (and arguably sub-optimal / ugly) code can
be removed here, too.

											
										
										
											2016-02-24 08:16:24 -08:00
+									if len(actual) < 4000 {
 										t.Fatalf("expected more than %d results after purging half of series, got %d", 4000, len(actual))
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									if actual[0].Timestamp < 6000 || actual[0].Timestamp > 10000 {
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										t.Errorf("1st timestamp out of expected range: %v", actual[0].Timestamp)
 									}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									want := model.Time(19998)
-												Improve predict_linear

Fixes https://github.com/prometheus/prometheus/issues/1401

This remove the last (and in fact bogus) use of BoundaryValues.

Thus, a whole lot of unused (and arguably sub-optimal / ugly) code can
be removed here, too.

											
										
										
											2016-02-24 08:16:24 -08:00
+									if actual[len(actual)-1].Timestamp != want {
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										t.Errorf("2nd timestamp: want %v, got %v", want, actual[1].Timestamp)
 									}
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									// Drop everything.
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									s.maintainMemorySeries(fp, 100000)
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
-												Improve predict_linear

Fixes https://github.com/prometheus/prometheus/issues/1401

This remove the last (and in fact bogus) use of BoundaryValues.

Thus, a whole lot of unused (and arguably sub-optimal / ugly) code can
be removed here, too.

											
										
										
											2016-02-24 08:16:24 -08:00
+									actual = it.RangeValues(metric.Interval{
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										OldestInclusive: 0,
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+										NewestInclusive: 100000,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									})
 									if len(actual) != 0 {
 										t.Fatal("expected zero results after purging the whole series")
 									}
 									// Recreate series.
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Improve performance of ingestion.

- Parallelize AppendSamples as much as possible without breaking the
  contract about temporal order.

- Allocate more fingerprint locker slots.

- Do not run early checkpoints if we are behind on chunk persistence.

- Increase fpMinWaitDuration to give the disk more time for more
  important things.

Also, switch math.MaxInt64 and math.MinInt64 to the new constants.

											
										
										
											2015-02-12 08:23:42 -08:00
+									s.WaitForIndexing()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									series, ok := s.fpToSeries.get(fp)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									if !ok {
 										t.Fatal("could not find series")
 									}
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+									// Persist head chunk so we can safely archive.
-												Redesign series maintenance and chunk persistence.

											
										
										
											2015-03-08 18:33:10 -07:00
+									series.headChunkClosed = true
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									s.maintainMemorySeries(fp, model.Earliest)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
-												Evict based on memory pressure. Evict recently used chunks last.

Change-Id: Ie6168f0cdb3917bdc63b6fe15585dd70c1e42afe

											
										
										
											2014-11-13 11:50:25 -08:00
+									// Archive metrics.
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									s.fpToSeries.del(fp)
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+									lastTime, err := series.head().LastTime()
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									if err != nil {
 										t.Fatal(err)
 									}
-												Fix accidental publishing of memorySeries.firstTime()

											
										
										
											2016-09-26 04:06:06 -07:00
+									s.persistence.archiveMetric(fp, series.metric, series.firstTime(), lastTime)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ := s.persistence.hasArchivedMetric(fp)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									if !archived {
 										t.Fatal("not archived")
 									}
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									// Drop ~half of the chunks of an archived series.
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									s.maintainArchivedSeries(fp, 10000)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ = s.persistence.hasArchivedMetric(fp)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									if !archived {
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+										t.Fatal("archived series purged although only half of the chunks dropped")
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
-												Fix the embarrassing bug introduced in commit 0851945.

In that commit, the 'maintainSeries' call was accidentally removed.

This commit refactors things a bit so that there is now a clean
'maintainMemorySeries' and a 'maintainArchivedSeries' call.

Straighten the nomenclature a bit (consistently use 'drop' for
chunks and 'purge' for series/metrics).

Remove the annoying 'Completed maintenance sweep through archived
fingerprints' message if there were no archived fingerprints to do
maintenance on.

											
										
										
											2015-02-26 06:19:44 -08:00
+									// Drop everything.
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									s.maintainArchivedSeries(fp, 100000)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ = s.persistence.hasArchivedMetric(fp)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									if archived {
 										t.Fatal("archived series not dropped")
 									}
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
 									// Recreate series.
 									for _, sample := range samples {
 										s.Append(sample)
 									}
 									s.WaitForIndexing()
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									series, ok = s.fpToSeries.get(fp)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									if !ok {
 										t.Fatal("could not find series")
 									}
 									// Persist head chunk so we can safely archive.
 									series.headChunkClosed = true
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									s.maintainMemorySeries(fp, model.Earliest)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
 									// Archive metrics.
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									s.fpToSeries.del(fp)
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+									lastTime, err = series.head().LastTime()
-												Handle errors caused by data corruption more gracefully

This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.

											
										
										
											2016-02-25 03:23:42 -08:00
+									if err != nil {
 										t.Fatal(err)
 									}
-												Fix accidental publishing of memorySeries.firstTime()

											
										
										
											2016-09-26 04:06:06 -07:00
+									s.persistence.archiveMetric(fp, series.metric, series.firstTime(), lastTime)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ = s.persistence.hasArchivedMetric(fp)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									if !archived {
 										t.Fatal("not archived")
 									}
 									// Unarchive metrics.
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									s.getOrCreateSeries(fp, model.Metric{})
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+									series, ok = s.fpToSeries.get(fp)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									if !ok {
 										t.Fatal("could not find series")
 									}
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ = s.persistence.hasArchivedMetric(fp)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									if archived {
 										t.Fatal("archived")
 									}
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+									// Set archiveHighWatermark to a low value so that we can see it increase.
 									s.archiveHighWatermark = 42
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									// This will archive again, but must not drop it completely, despite the
 									// memorySeries being empty.
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+									s.maintainMemorySeries(fp, 10000)
-												Clean up error propagation

Only return an error where callers are doing something with it except
simply logging and ignoring.

All the errors touched in this commit flag the storage as dirty
anyway, and that fact is logged anyway. So most of what is being
removed here is just log spam.

As discussed earlier, the class of errors that flags the storage as
dirty signals fundamental corruption, no even bubbling up a one-time
warning to the user (e.g. about incomplete results) isn't helping much
because _anything_ happening in the storage has to be doubted from
that point on (and in fact retroactively into the past, too). Flagging
the storage dirty, and alerting on it (plus marking the state in the
web UI) is the only way I can see right now.

As a byproduct, I cleaned up the setDirty method a bit and improved
the logged errors.

											
										
										
											2016-03-09 09:56:30 -08:00
+									archived, _, _ = s.persistence.hasArchivedMetric(fp)
-												Fix the case where a series in memory has 0 chunks, but chunks on disk.

This is actually completely normal for a freshly unarchived series.

Test added to expose.

											
										
										
											2015-04-09 06:57:11 -07:00
+									if !archived {
 										t.Fatal("series purged completely")
 									}
-												Add tests for range-limited label matching

While doing so, improve getSeriesForRange.

											
										
										
											2016-03-09 11:27:50 -08:00
+									// archiveHighWatermark must have been set by maintainMemorySeries.
 									if want, got := model.Time(19998), s.archiveHighWatermark; want != got {
 										t.Errorf("want archiveHighWatermark %v, got %v", want, got)
 									}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								}
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								func TestEvictAndPurgeSeriesChunkType0(t *testing.T) {
 									testEvictAndPurgeSeries(t, 0)
 								}
 								func TestEvictAndPurgeSeriesChunkType1(t *testing.T) {
 									testEvictAndPurgeSeries(t, 1)
 								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func TestEvictAndPurgeSeriesChunkType2(t *testing.T) {
 									testEvictAndPurgeSeries(t, 2)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testEvictAndLoadChunkDescs(t *testing.T, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, 10000)
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Timestamp: model.Time(2 * i),
 											Value:     model.SampleValue(float64(i * i)),
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+										}
 									}
 									// Give last sample a timestamp of now so that the head chunk will not
 									// be closed (which would then archive the time series later as
 									// everything will get evicted).
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples[len(samples)-1] = &model.Sample{
 										Timestamp: model.Now(),
 										Value:     model.SampleValue(3.14),
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+									}
-												Re-add counting of evict chunk ops and decrementing NumMemChunks

Also, modify test to expose the regression.

											
										
										
											2016-10-10 07:30:10 -07:00
+									// Sadly, chunk.NumMemChunks is a global variable. We have to reset it
 									// explicitly here.
 									atomic.StoreInt64(&chunk.NumMemChunks, 0)
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+									s, closer := NewTestStorage(t, encoding)
 									defer closer.Close()
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+									// Adjust target heap size to lower value to see evictions.
 									s.targetHeapSize = 1000000
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
 									for _, sample := range samples {
 										s.Append(sample)
 									}
 									s.WaitForIndexing()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									fp := model.Metric{}.FastFingerprint()
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
 									series, ok := s.fpToSeries.get(fp)
 									if !ok {
 										t.Fatal("could not find series")
 									}
 									oldLen := len(series.chunkDescs)
 									// Maintain series without any dropped chunks.
 									s.maintainMemorySeries(fp, 0)
 									// Give the evict goroutine an opportunity to run.
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+									time.Sleep(1250 * time.Millisecond)
-												Review fixups.

											
										
										
											2016-09-28 14:33:34 -07:00
+									// Maintain series again to trigger chunk.Desc eviction.
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+									s.maintainMemorySeries(fp, 0)
 									if oldLen <= len(series.chunkDescs) {
 										t.Errorf("Expected number of chunkDescs to decrease, old number %d, current number %d.", oldLen, len(series.chunkDescs))
 									}
-												Re-add counting of evict chunk ops and decrementing NumMemChunks

Also, modify test to expose the regression.

											
										
										
											2016-10-10 07:30:10 -07:00
+									if int64(len(series.chunkDescs)) < atomic.LoadInt64(&chunk.NumMemChunks) {
 										t.Errorf("NumMemChunks is larger than number of chunk descs, number of chunk descs: %d, NumMemChunks: %d.", len(series.chunkDescs), atomic.LoadInt64(&chunk.NumMemChunks))
 									}
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
 									// Load everything back.
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), 0, 100000)
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
 									if oldLen != len(series.chunkDescs) {
 										t.Errorf("Expected number of chunkDescs to have reached old value again, old number %d, current number %d.", oldLen, len(series.chunkDescs))
 									}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									it.Close()
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
 									// Now maintain series with drops to make sure nothing crazy happens.
 									s.maintainMemorySeries(fp, 100000)
 									if len(series.chunkDescs) != 1 {
-												Review fixups.

											
										
										
											2016-09-28 14:33:34 -07:00
+										t.Errorf("Expected exactly one chunk.Desc left, got %d.", len(series.chunkDescs))
-												Test chunkDesc eviction and loading

											
										
										
											2015-07-15 10:53:15 -07:00
+									}
 								}
 								func TestEvictAndLoadChunkDescsType0(t *testing.T) {
 									testEvictAndLoadChunkDescs(t, 0)
 								}
 								func TestEvictAndLoadChunkDescsType1(t *testing.T) {
 									testEvictAndLoadChunkDescs(t, 1)
 								}
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func benchmarkAppend(b *testing.B, encoding chunk.Encoding) {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									samples := make(model.Samples, b.N)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									for i := range samples {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										samples[i] = &model.Sample{
 											Metric: model.Metric{
 												model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
 												"label1":              model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
 												"label2":              model.LabelValue(fmt.Sprintf("test_metric_%d", i%10)),
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+											},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											Timestamp: model.Time(i),
 											Value:     model.SampleValue(i),
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+										}
 									}
 									b.ResetTimer()
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+									s, closer := NewTestStorage(b, encoding)
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+									defer closer.Close()
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+									for _, sample := range samples {
 										s.Append(sample)
 									}
-												Initial experimental snapshot of next-gen storage.

Change-Id: Ifb8709960dbedd1d9f5efd88cdd359ee9fa9d26d

											
										
										
											2014-06-06 02:55:53 -07:00
+								}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								func BenchmarkAppendType0(b *testing.B) {
 									benchmarkAppend(b, 0)
 								}
 								func BenchmarkAppendType1(b *testing.B) {
 									benchmarkAppend(b, 1)
 								}
-												Rename Gorilla into varbit

											
										
										
											2016-03-23 08:30:41 -07:00
+								func BenchmarkAppendType2(b *testing.B) {
 									benchmarkAppend(b, 2)
 								}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								// Append a large number of random samples and then check if we can get them out
 								// of the storage alright.
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func testFuzz(t *testing.T, encoding chunk.Encoding) {
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									if testing.Short() {
 										t.Skip("Skipping test in short mode.")
 									}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									check := func(seed int64) bool {
 										rand.Seed(seed)
-												Improve various things around chunk encoding.

A number of mostly minor things:

- Rename chunk type -> chunk encoding.

- After all, do not carry around the chunk encoding to all parts of
  the system, but just have one place where the encoding for new
  chunks is set based on the flag. The new approach has caveats as
  well, but the polution of so many method signatures is worse.

- Use the default chunk encoding for new chunks of existing
  series. (Previously, only new _series_ would get chunks with the
  default encoding.)

- Use an enum for chunk encoding. (But keep the version number for the
  flag, for reasons discussed previously.)

- Add encoding() to the chunk interface (so that a chunk knows its own
  encoding - no need to have that in a different top-level function).

- Got rid of newFollowUpChunk (which would keep the existing encoding
  for all chunks of a time series). Now only use newChunk(), which
  will create a chunk encoding according to the flag.

- Simplified transcodeAndAdd.

- Reordered methods of deltaEncodedChunk and doubleDeltaEncoded chunk
  to match the order in the chunk interface.

- Only transcode if the chunk is not yet half full. If more than half
  full, add a new chunk instead.

											
										
										
											2015-03-13 07:49:07 -07:00
+										s, c := NewTestStorage(t, encoding)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										defer c.Close()
-												Add benchmarks for series iterator methods.

											
										
										
											2015-05-19 10:12:01 -07:00
+										samples := createRandomSamples("test_fuzz", 10000)
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+										for _, sample := range samples {
 											s.Append(sample)
 										}
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										if !verifyStorageRandom(t, s, samples) {
 											return false
 										}
 										return verifyStorageSequential(t, s, samples)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									}
 									if err := quick.Check(check, nil); err != nil {
 										t.Fatal(err)
 									}
 								}
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								func TestFuzzChunkType0(t *testing.T) {
 									testFuzz(t, 0)
 								}
 								func TestFuzzChunkType1(t *testing.T) {
 									testFuzz(t, 1)
 								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func TestFuzzChunkType2(t *testing.T) {
 									testFuzz(t, 2)
 								}
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+								// benchmarkFuzz is the benchmark version of testFuzz. The storage options are
 								// set such that evictions, checkpoints, and purging will happen concurrently,
 								// too. This benchmark will have a very long runtime (up to minutes). You can
 								// use it as an actual benchmark. Run it like this:
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								//
-												Redesign series maintenance and chunk persistence.

											
										
										
											2015-03-08 18:33:10 -07:00
+								// go test -cpu 1,2,4,8 -run=NONE -bench BenchmarkFuzzChunkType -benchmem
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								//
 								// You can also use it as a test for races. In that case, run it like this (will
 								// make things even slower):
 								//
-												Redesign series maintenance and chunk persistence.

											
										
										
											2015-03-08 18:33:10 -07:00
+								// go test -race -cpu 8 -short -bench BenchmarkFuzzChunkType
-												storage: separate chunk package, publish more names

This is a followup to https://github.com/prometheus/prometheus/pull/2011.

This publishes more of the methods and other names of the chunk code and
moves the chunk code to its own package. There's some unavoidable
ugliness: the chunk and chunkDesc metrics are used by both packages, so
I had to move them to the chunk package. That isn't great, but I don't
see how to do it better without a larger redesign of everything. Same
for the evict requests and some other types.

											
										
										
											2016-09-21 14:44:27 -07:00
+								func benchmarkFuzz(b *testing.B, encoding chunk.Encoding) {
 									chunk.DefaultEncoding = encoding
-												Make floats exact again.

This should do the right thing for the old delta chunks, too.

											
										
										
											2015-03-06 07:03:03 -08:00
+									const samplesPerRun = 100000
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									rand.Seed(42)
-												Move test package to pkg/testutil

											
										
										
											2015-05-28 11:58:38 -07:00
+									directory := testutil.NewTemporaryDirectory("test_storage", b)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									defer directory.Close()
 									o := &MemorySeriesStorageOptions{
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
+										TargetHeapSize:             200000,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										PersistenceRetentionPeriod: time.Hour,
 										PersistenceStoragePath:     directory.Path(),
-												storage: Use staleness delta as head chunk timeout

Currently, if a series stops to exist, its head chunk will be kept
open for an hour. That prevents it from being persisted. Which
prevents it from being evicted. Which prevents the series from being
archived.

Most of the time, once no sample has been added to a series within the
staleness limit, we can be pretty confident that this series will not
receive samples anymore. The whole chain as described above can be
started after 5m instead of 1h. In the relaxed case, this doesn't
change a lot as the head chunk timeout is only checked during series
maintenance, and usually, a series is only maintained every six
hours. However, there is the typical scenario where a large service is
deployed, the deoply turns out to be bad, and then it is deployed
again within minutes, and quite quickly the number of time series has
tripled. That's the point where the Prometheus server is stressed and
switches (rightfully) into rushed mode. In that mode, time series are
processed as quickly as possible, but all of that is in vein if all of
those recently ended time series cannot be persisted yet for another
hour. In that scenario, this change will help most, and it's exactly
the scenario where help is most desperately needed.

											
										
										
											2017-03-26 14:44:50 -07:00
+										HeadChunkTimeout:           5 * time.Minute,
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+										CheckpointInterval:         time.Second,
-												Increase resilience of the storage against data corruption - step 4.

Step 4: Add a configurable sync'ing of series files after modification.

											
										
										
											2015-03-19 07:41:50 -07:00
+										SyncStrategy:               Adaptive,
-												Improve handling of series file truncation

If only very few chunks are to be truncated from a very large series
file, the rewrite of the file is a lorge overhead. With this change, a
certain ratio of the file has to be dropped to make it happen. While
only causing disk overhead at about the same ratio (by default 10%),
it will cut down I/O by a lot in above scenario.

											
										
										
											2016-01-11 07:42:10 -08:00
+										MinShrinkRatio:             0.1,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
-												Do not start storage processing before Start() is called.

											
										
										
											2015-05-18 10:26:28 -07:00
+									s := NewMemorySeriesStorage(o)
 									if err := s.Start(); err != nil {
 										b.Fatalf("Error starting storage: %s", err)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									}
 									s.Start()
 									defer s.Stop()
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
 									samples := createRandomSamples("benchmark_fuzz", samplesPerRun*b.N)
 									b.ResetTimer()
 									for i := 0; i < b.N; i++ {
 										start := samplesPerRun * i
 										end := samplesPerRun * (i + 1)
 										middle := (start + end) / 2
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+										for _, sample := range samples[start:middle] {
 											s.Append(sample)
 										}
-												Separate query interface out of local.Storage.

PromQL only requires a much narrower interface than local.Storage in
order to run queries. Narrower interfaces are easier to replace and
test, too.

We could also change the web interface to use local.Querier, except that
we'll probably use appending functions from there in the future.

											
										
										
											2016-06-23 04:03:41 -07:00
+										verifyStorageRandom(b, s, samples[:middle])
-												Remove the sample ingestion channel.

The one central sample ingestion channel has caused a variety of
trouble. This commit removes it. Targets and rule evaluation call an
Append method directly now. To incorporate multiple storage backends
(like OpenTSDB), storage.Tee forks the Append into two different
appenders.

Note that the tsdb queue manager had its own queue anyway. It was a
queue after a queue... Much queue, so overhead...

Targets have their own little buffer (implemented as a channel) to
avoid stalling during an http scrape. But a new scrape will only be
started once the old one is fully ingested.

The contraption of three pipelined ingesters was removed. A Target is
an ingester itself now. Despite more logic in Target, things should be
less confusing now.

Also, remove lint and vet warnings in ast.go.

											
										
										
											2015-03-14 19:36:15 -07:00
+										for _, sample := range samples[middle:end] {
 											s.Append(sample)
 										}
-												Separate query interface out of local.Storage.

PromQL only requires a much narrower interface than local.Storage in
order to run queries. Narrower interfaces are easier to replace and
test, too.

We could also change the web interface to use local.Querier, except that
we'll probably use appending functions from there in the future.

											
										
										
											2016-06-23 04:03:41 -07:00
+										verifyStorageRandom(b, s, samples[:end])
 										verifyStorageSequential(b, s, samples)
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+									}
 								}
 								func BenchmarkFuzzChunkType0(b *testing.B) {
 									benchmarkFuzz(b, 0)
 								}
 								func BenchmarkFuzzChunkType1(b *testing.B) {
 									benchmarkFuzz(b, 1)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+								}
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+								func BenchmarkFuzzChunkType2(b *testing.B) {
 									benchmarkFuzz(b, 2)
 								}
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+								func createRandomSamples(metricName string, minLen int) model.Samples {
 									type valueCreator func() model.SampleValue
 									type deltaApplier func(model.SampleValue) model.SampleValue
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
 									var (
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										maxMetrics      = 5
 										maxStreakLength = 2000
 										maxTimeDelta    = 10000
 										timestamp       = model.Now() - model.Time(maxTimeDelta*minLen) // So that some timestamps are in the future.
 										generators      = []struct {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											createValue valueCreator
 											applyDelta  []deltaApplier
 										}{
 											{ // "Boolean".
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												createValue: func() model.SampleValue {
 													return model.SampleValue(rand.Intn(2))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												},
 												applyDelta: []deltaApplier{
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(_ model.SampleValue) model.SampleValue {
 														return model.SampleValue(rand.Intn(2))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
 												},
 											},
 											{ // Integer with int deltas of various byte length.
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												createValue: func() model.SampleValue {
 													return model.SampleValue(rand.Int63() - 1<<62)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												},
 												applyDelta: []deltaApplier{
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(v model.SampleValue) model.SampleValue {
 														return model.SampleValue(rand.Intn(1<<8) - 1<<7 + int(v))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(v model.SampleValue) model.SampleValue {
 														return model.SampleValue(rand.Intn(1<<16) - 1<<15 + int(v))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(v model.SampleValue) model.SampleValue {
 														return model.SampleValue(rand.Int63n(1<<32) - 1<<31 + int64(v))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
 												},
 											},
 											{ // Float with float32 and float64 deltas.
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												createValue: func() model.SampleValue {
 													return model.SampleValue(rand.NormFloat64())
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												},
 												applyDelta: []deltaApplier{
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(v model.SampleValue) model.SampleValue {
 														return v + model.SampleValue(float32(rand.NormFloat64()))
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+													func(v model.SampleValue) model.SampleValue {
 														return v + model.SampleValue(rand.NormFloat64())
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													},
 												},
 											},
 										}
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										timestampIncrementers = []func(baseDelta model.Time) model.Time{
 											// Regular increments.
 											func(delta model.Time) model.Time {
 												return delta
 											},
 											// Jittered increments. σ is 1/100 of delta, e.g. 10ms for 10s scrape interval.
 											func(delta model.Time) model.Time {
 												return delta + model.Time(rand.NormFloat64()*float64(delta)/100)
 											},
 											// Regular increments, but missing a scrape with 10% chance.
 											func(delta model.Time) model.Time {
 												i := rand.Intn(100)
 												if i < 90 {
 													return delta
 												}
 												if i < 99 {
 													return 2 * delta
 												}
 												return 3 * delta
 												// Ignoring the case with more than two missed scrapes in a row.
 											},
 										}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									)
-												Fix typo.

											
										
										
											2015-05-11 08:15:30 -07:00
+									// Prefill result with two samples with colliding metrics (to test fingerprint mapping).
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									result := model.Samples{
 										&model.Sample{
 											Metric: model.Metric{
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+												"instance": "ip-10-33-84-73.l05.ams5.s-cloud.net:24483",
 												"status":   "503",
 											},
 											Value:     42,
 											Timestamp: timestamp,
 										},
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										&model.Sample{
 											Metric: model.Metric{
-												Handle fingerprint collisions.

											
										
										
											2015-05-06 07:53:12 -07:00
+												"instance": "ip-10-33-84-73.l05.ams5.s-cloud.net:24480",
 												"status":   "500",
 											},
 											Value:     2010,
 											Timestamp: timestamp + 1,
 										},
 									}
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									metrics := []model.Metric{}
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									for n := rand.Intn(maxMetrics); n >= 0; n-- {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+										metrics = append(metrics, model.Metric{
 											model.MetricNameLabel:                             model.LabelValue(metricName),
 											model.LabelName(fmt.Sprintf("labelname_%d", n+1)): model.LabelValue(fmt.Sprintf("labelvalue_%d", rand.Int())),
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										})
 									}
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+									for len(result) < minLen {
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										var (
 											// Pick a metric for this cycle.
 											metric       = metrics[rand.Intn(len(metrics))]
 											timeDelta    = model.Time(rand.Intn(maxTimeDelta) + 1)
 											generator    = generators[rand.Intn(len(generators))]
 											createValue  = generator.createValue
 											applyDelta   = generator.applyDelta[rand.Intn(len(generator.applyDelta))]
 											incTimestamp = timestampIncrementers[rand.Intn(len(timestampIncrementers))]
 										)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+										switch rand.Intn(4) {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										case 0: // A single sample.
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+											result = append(result, &model.Sample{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												Metric:    metric,
 												Value:     createValue(),
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+												Timestamp: timestamp,
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											})
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+											timestamp += incTimestamp(timeDelta)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										case 1: // A streak of random sample values.
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											for n := rand.Intn(maxStreakLength); n >= 0; n-- {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												result = append(result, &model.Sample{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													Metric:    metric,
 													Value:     createValue(),
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+													Timestamp: timestamp,
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												})
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+												timestamp += incTimestamp(timeDelta)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											}
 										case 2: // A streak of sample values with incremental changes.
 											value := createValue()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											for n := rand.Intn(maxStreakLength); n >= 0; n-- {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												result = append(result, &model.Sample{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													Metric:    metric,
 													Value:     value,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+													Timestamp: timestamp,
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												})
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+												timestamp += incTimestamp(timeDelta)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												value = applyDelta(value)
 											}
 										case 3: // A streak of constant sample values.
 											value := createValue()
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											for n := rand.Intn(maxStreakLength); n >= 0; n-- {
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+												result = append(result, &model.Sample{
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+													Metric:    metric,
 													Value:     value,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+													Timestamp: timestamp,
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+												})
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+												timestamp += incTimestamp(timeDelta)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											}
 										}
 									}
 									return result
 								}
-												storage: Make MemorySeriesStorage a public type

See https://twitter.com/fabxc/status/748032597876482048

											
										
										
											2016-06-28 23:14:23 -07:00
+								func verifyStorageRandom(t testing.TB, s *MemorySeriesStorage, samples model.Samples) bool {
-												Implement double-delta encoded chunks.

											
										
										
											2015-03-04 04:40:18 -08:00
+									s.WaitForIndexing()
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+									result := true
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+									for _, i := range rand.Perm(len(samples)) {
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										sample := samples[i]
-												Checkpoint fingerprint mappings only upon shutdown

Before, we checkpointed after every newly detected fingerprint
collision, which is not a problem as long as collisions are
rare. However, with a sufficient number of metrics or particular
nature of the data set, there might be a lot of collisions, all to be
detected upon the first set of scrapes, and then the checkpointing
after each detection will take a quite long time (it's O(n²),
essentially).

Since we are rebuilding the fingerprint mapping during crash recovery,
the previous, very conservative approach didn't even buy us
anything. We only ever read from the checkpoint file after a clean
shutdown, so the only time we need to write the checkpoint file is
during a clean shutdown.

											
										
										
											2016-04-14 07:02:37 -07:00
+										fp := s.mapper.mapFP(sample.Metric.FastFingerprint(), sample.Metric)
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+										it := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fp), sample.Timestamp, sample.Timestamp)
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										found := it.ValueAtOrBeforeTime(sample.Timestamp)
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										startTime := it.(*boundedIterator).start
 										switch {
 										case found.Timestamp != model.Earliest && sample.Timestamp.Before(startTime):
 											t.Errorf("Sample #%d %#v: Expected outdated sample to be excluded.", i, sample)
 											result = false
 										case found.Timestamp == model.Earliest && !sample.Timestamp.Before(startTime):
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+											t.Errorf("Sample #%d %#v: Expected sample not found.", i, sample)
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											result = false
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										case found.Timestamp == model.Earliest && sample.Timestamp.Before(startTime):
 											// All good. Outdated sample dropped.
 										case sample.Value != found.Value || sample.Timestamp != found.Timestamp:
 											t.Errorf(
 												"Sample #%d %#v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v).",
 												i, sample, sample.Value, sample.Timestamp, found.Value, found.Timestamp,
 											)
 											result = false
 										}
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										it.Close()
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+									}
 									return result
 								}
-												storage: Make MemorySeriesStorage a public type

See https://twitter.com/fabxc/status/748032597876482048

											
										
										
											2016-06-28 23:14:23 -07:00
+								func verifyStorageSequential(t testing.TB, s *MemorySeriesStorage, samples model.Samples) bool {
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+									s.WaitForIndexing()
 									var (
 										result = true
 										fp     model.Fingerprint
 										it     SeriesIterator
 										r      []model.SamplePair
 										j      int
 									)
 									defer func() {
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+										it.Close()
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+									}()
 									for i, sample := range samples {
-												Checkpoint fingerprint mappings only upon shutdown

Before, we checkpointed after every newly detected fingerprint
collision, which is not a problem as long as collisions are
rare. However, with a sufficient number of metrics or particular
nature of the data set, there might be a lot of collisions, all to be
detected upon the first set of scrapes, and then the checkpointing
after each detection will take a quite long time (it's O(n²),
essentially).

Since we are rebuilding the fingerprint mapping during crash recovery,
the previous, very conservative approach didn't even buy us
anything. We only ever read from the checkpoint file after a clean
shutdown, so the only time we need to write the checkpoint file is
during a clean shutdown.

											
										
										
											2016-04-14 07:02:37 -07:00
+										newFP := s.mapper.mapFP(sample.Metric.FastFingerprint(), sample.Metric)
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										if it == nil || newFP != fp {
 											fp = newFP
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+											if it != nil {
 												it.Close()
 											}
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+											it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), sample.Timestamp, model.Latest)
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+											r = it.RangeValues(metric.Interval{
 												OldestInclusive: sample.Timestamp,
 												NewestInclusive: model.Latest,
 											})
 											j = -1
 										}
 										startTime := it.(*boundedIterator).start
 										if sample.Timestamp.Before(startTime) {
 											continue
 										}
 										j++
 										if j >= len(r) {
 											t.Errorf(
 												"Sample #%d %v not found.",
 												i, sample,
 											)
 											result = false
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											continue
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+										}
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+										found := r[j]
-												Streamline series iterator creation

This will fix issue #1035 and will also help to make issue #1264 less
bad.

The fundamental problem in the current code:

In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.

This commit addresses the problem from two sides:

First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.

Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.

To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.

The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.

Benchmarks:

The BenchmarkFuzz.* benchmarks run 83% faster (i.e. about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.

To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.

In addition to performance improvements, this commit removes about 150
LOC.

											
										
										
											2016-02-16 09:47:50 -08:00
+										if sample.Value != found.Value || sample.Timestamp != found.Timestamp {
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											t.Errorf(
-												Improve fuzz testing and fix a bug exposed

This improves fuzz testing in two ways:

(1) More realistic time stamps. So far, the most common case in
practice was very rare in the test: Completely regular increases of
the timestamp.

(2) Verify samples by scanning through the whole relevant section of
the series.

For Gorilla-like chunks, this showed two things:

(1) With more regularly increasing time stamps, BenchmarkFuzz is
essentially as fast as with the traditional chunks:

```
BenchmarkFuzzChunkType0-8              2         972514684 ns/op        83426196 B/op    2500044 allocs/op
BenchmarkFuzzChunkType1-8              2         971478001 ns/op        82874660 B/op    2512364 allocs/op
BenchmarkFuzzChunkType2-8              2         999339453 ns/op        76670636 B/op    2366116 allocs/op
```

(2) There was a bug related to when and how the chunk footer is
overwritten to make use for the last sample. This wasn't exposed by
random access as the last sample of a chunk is retrieved from the
values in the header in that case.

											
										
										
											2016-03-20 09:14:47 -07:00
+												"Sample #%d %v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v).",
-												Implement Gorilla-inspired chunk encoding

This is not a verbatim implementation of the Gorilla encoding.  First
of all, it could not, even if we wanted, because Prometheus has a
different chunking model (constant size, not constant time).  Second,
this adds a number of changes that improve the encoding in general or
at least for the specific use case of Prometheus (and are partially
only possible in the context of Prometheus). See comments in the code
for details.

											
										
										
											2016-03-12 12:34:51 -08:00
+												i, sample, sample.Value, sample.Timestamp, found.Value, found.Timestamp,
-												Add more tests.

Add an end-to-end fuzz and race test.

Fix a race exposed by the above.

Change-Id: Ifaa39a90cefbde8d4c29bda197cc92592ded21bb

											
										
										
											2014-10-28 11:01:41 -07:00
+											)
-												Improve testing.

In particular, create a fuzz test for time series.

Change-Id: I523a17912405a0b6b46bd395c781d201dfe55036

											
										
										
											2014-08-14 09:23:49 -07:00
+											result = false
 										}
 									}
 									return result
 								}
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
 								func TestAppendOutOfOrder(t *testing.T) {
-												Switch chunk encoding to type 2 where it was hardcoded type 1 before

The chunk encoding was hardcoded there because it mostly doesn't
matter what encoding is chosen in that test. Since type 1 is
battle-hardened enough, I'm switching to type 2 here so that we can
catch unexpected problems as a byproduct. My expectation is that the
chunk encoding doesn't matter anyway, as said, but then "unexpected
problems" contains the word "unexpected".

											
										
										
											2016-03-20 15:32:20 -07:00
+									s, closer := NewTestStorage(t, 2)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									defer closer.Close()
-												Switch from client_golang/model to common/model

											
										
										
											2015-08-20 08:18:46 -07:00
+									m := model.Metric{
 										model.MetricNameLabel: "out_of_order",
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									}
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+									tests := []struct {
 										name      string
 										timestamp model.Time
 										value     model.SampleValue
 										wantErr   error
 									}{
 										{
 											name:      "1st sample",
 											timestamp: 0,
 											value:     0,
 											wantErr:   nil,
 										},
 										{
 											name:      "regular append",
 											timestamp: 2,
 											value:     1,
 											wantErr:   nil,
 										},
 										{
 											name:      "same timestamp, same value (no-op)",
 											timestamp: 2,
 											value:     1,
 											wantErr:   nil,
 										},
 										{
 											name:      "same timestamp, different value",
 											timestamp: 2,
 											value:     2,
 											wantErr:   ErrDuplicateSampleForTimestamp,
 										},
 										{
 											name:      "earlier timestamp, same value",
 											timestamp: 1,
 											value:     2,
 											wantErr:   ErrOutOfOrderSample,
 										},
 										{
 											name:      "earlier timestamp, different value",
 											timestamp: 1,
 											value:     3,
 											wantErr:   ErrOutOfOrderSample,
 										},
 										{
 											name:      "regular append of NaN",
 											timestamp: 3,
 											value:     model.SampleValue(math.NaN()),
 											wantErr:   nil,
 										},
 										{
 											name:      "no-op append of NaN",
 											timestamp: 3,
 											value:     model.SampleValue(math.NaN()),
 											wantErr:   nil,
 										},
 										{
 											name:      "append of NaN with earlier timestamp",
 											timestamp: 2,
 											value:     model.SampleValue(math.NaN()),
 											wantErr:   ErrOutOfOrderSample,
 										},
 										{
 											name:      "append of normal sample after NaN with same timestamp",
 											timestamp: 3,
 											value:     3.14,
 											wantErr:   ErrDuplicateSampleForTimestamp,
 										},
 									}
 									for _, test := range tests {
 										gotErr := s.Append(&model.Sample{
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+											Metric:    m,
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+											Timestamp: test.timestamp,
 											Value:     test.value,
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+										})
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+										if gotErr != test.wantErr {
 											t.Errorf("%s: got %q, want %q", test.name, gotErr, test.wantErr)
 										}
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									}
-												Checkpoint fingerprint mappings only upon shutdown

Before, we checkpointed after every newly detected fingerprint
collision, which is not a problem as long as collisions are
rare. However, with a sufficient number of metrics or particular
nature of the data set, there might be a lot of collisions, all to be
detected upon the first set of scrapes, and then the checkpointing
after each detection will take a quite long time (it's O(n²),
essentially).

Since we are rebuilding the fingerprint mapping during crash recovery,
the previous, very conservative approach didn't even buy us
anything. We only ever read from the checkpoint file after a clean
shutdown, so the only time we need to write the checkpoint file is
during a clean shutdown.

											
										
										
											2016-04-14 07:02:37 -07:00
+									fp := s.mapper.mapFP(m.FastFingerprint(), m)
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
-												New fpsForLabelMatchers and seriesForLabelMatchers methods

These more specific methods have replaced `metricForLabelMatchers`
in cases where its  `map[fingerprint]metric` result type was
not necessary or was used as an intermediate step

Avoids duplicated calls to `seriesForRange` from
`QueryRange` and `QueryInstant` methods.

											
										
										
											2016-09-18 04:20:46 -07:00
+									it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), 0, 2)
-												Make the storage interface higher-level.

See discussion in
https://groups.google.com/forum/#!topic/prometheus-developers/bkuGbVlvQ9g

The main idea is that the user of a storage shouldn't have to deal with
fingerprints anymore, and should not need to do an individual preload
call for each metric. The storage interface needs to be made more
high-level to not expose these details.

This also makes it easier to reuse the same storage interface for remote
storages later, as fewer roundtrips are required and the fingerprint
concept doesn't work well across the network.

NOTE: this deliberately gets rid of a small optimization in the old
query Analyzer, where we dedupe instants and ranges for the same series.
This should have a minor impact, as most queries do not have multiple
selectors loading the same series (and at the same offset).

											
										
										
											2016-07-11 11:27:25 -07:00
+									defer it.Close()
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
-												Replace metric.SamplePair with model.SamplePair

											
										
										
											2015-08-22 05:52:35 -07:00
+									want := []model.SamplePair{
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+										{
 											Timestamp: 0,
 											Value:     0,
 										},
 										{
 											Timestamp: 2,
-												Improve TestAppendOutOfOrder

											
										
										
											2015-07-16 03:48:33 -07:00
+											Value:     1,
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+										},
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+										{
 											Timestamp: 3,
 											Value:     model.SampleValue(math.NaN()),
 										},
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									}
-												Improve TestAppendOutOfOrder

It did not test the returned error so far.
Also, add tests for the NaN case broken before
https://github.com/prometheus/common/pull/40

											
										
										
											2016-05-20 04:46:33 -07:00
+									got := it.RangeValues(metric.Interval{OldestInclusive: 0, NewestInclusive: 3})
 									// Note that we cannot just reflect.DeepEqual(want, got) because it has
 									// the semantics of NaN != NaN.
 									for i, gotSamplePair := range got {
 										wantSamplePair := want[i]
 										if !wantSamplePair.Equal(&gotSamplePair) {
 											t.Fatalf("want %v, got %v", wantSamplePair, gotSamplePair)
 										}
-												storage: ensure timestamp monotonicity within series.

Fixes https://github.com/prometheus/prometheus/issues/481

While doing so, clean up and fix a few other things:

- Fix `go vet` warnings (@fabxc to blame ;).

- Fix a racey problem with unarchiving: Whenever we unarchive a
  series, we essentially want to do something with it. However, until
  we have done something with it, it appears like a series that is
  ready to be archived or even purged. So e.g. it would be ignored
  during checkpointing. With this fix, we always load the chunkDescs
  upon unarchiving. This is wasteful if we only want to add a new
  sample to an archived time series, but the (presumably more common)
  case where we access an archived time series in a query doesn't
  become more expensive.

- The change above streamlined the getOrCreateSeries ond
  newMemorySeries flow. Also, the modTime is now always set correctly.

- Fix the leveldb-backed implementation of KeyValueStore.Delete. It
  had the wrong behavior of still returning true, nil if a
  non-existing key has been passed in.

											
										
										
											2015-07-13 12:12:27 -07:00
+									}
 								}
-												storage: Evict chunks and calculate persistence pressure based on target heap size

This is a fairly easy attempt to dynamically evict chunks based on the
heap size. A target heap size has to be set as a command line flage,
so that users can essentially say "utilize 4GiB of RAM, and please
don't OOM".

The -storage.local.max-chunks-to-persist and
-storage.local.memory-chunks flags are deprecated by this
change. Backwards compatibility is provided by ignoring
-storage.local.max-chunks-to-persist and use
-storage.local.memory-chunks to set the new
-storage.local.target-heap-size to a reasonable (and conservative)
value (both with a warning).

This also makes the metrics intstrumentation more consistent (in
naming and implementation) and cleans up a few quirks in the tests.

Answers to anticipated comments:

There is a chance that Go 1.9 will allow programs better control over
the Go memory management. I don't expect those changes to be in
contradiction with the approach here, but I do expect them to
complement them and allow them to be more precise and controlled. In
any case, once those Go changes are available, this code has to be
revisted.

One might be tempted to let the user specify an estimated value for
the RSS usage, and then internall set a target heap size of a certain
fraction of that. (In my experience, 2/3 is a fairly safe bet.)
However, investigations have shown that RSS size and its relation to
the heap size is really really complicated. It depends on so many
factors that I wouldn't even start listing them in a commit
description. It depends on many circumstances and not at least on the
risk trade-off of each individual user between RAM utilization and
probability of OOMing during a RAM usage peak. To not add even more to
the confusion, we need to stick to the well-defined number we also use
in the targeting here, the sum of the sizes of heap objects.

											
										
										
											2017-03-01 06:17:31 -08:00
 								func TestCalculatePersistUrgency(t *testing.T) {
 									tests := map[string]struct {
 										persistUrgency                        int32
 										lenEvictList                          int
 										numChunksToPersist                    int64
 										targetHeapSize, msNextGC, msHeapAlloc uint64
 										msNumGC, lastNumGC                    uint32
 										wantPersistUrgency int32
 										wantChunksToEvict  int
 										wantLastNumGC      uint32
 									}{
 										"all zeros": {
 											persistUrgency:     0,
 											lenEvictList:       0,
 											numChunksToPersist: 0,
 											targetHeapSize:     0,
 											msNextGC:           0,
 											msHeapAlloc:        0,
 											msNumGC:            0,
 											lastNumGC:          0,
 											wantPersistUrgency: 0,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      0,
 										},
 										"far from target heap size, plenty of chunks to persist, GC has happened": {
 											persistUrgency:     500,
 											lenEvictList:       1000,
 											numChunksToPersist: 100,
 											targetHeapSize:     1000000,
 											msNextGC:           500000,
 											msHeapAlloc:        400000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 45,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"far from target heap size, plenty of chunks to persist, GC hasn't happened, urgency must not decrease": {
 											persistUrgency:     500,
 											lenEvictList:       1000,
 											numChunksToPersist: 100,
 											targetHeapSize:     1000000,
 											msNextGC:           500000,
 											msHeapAlloc:        400000,
 											msNumGC:            42,
 											lastNumGC:          42,
 											wantPersistUrgency: 500,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"far from target heap size but no chunks to persist": {
 											persistUrgency:     50,
 											lenEvictList:       0,
 											numChunksToPersist: 100,
 											targetHeapSize:     1000000,
 											msNextGC:           500000,
 											msHeapAlloc:        400000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 500,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"far from target heap size but no chunks to persist, HeapAlloc > NextGC": {
 											persistUrgency:     50,
 											lenEvictList:       0,
 											numChunksToPersist: 100,
 											targetHeapSize:     1000000,
 											msNextGC:           500000,
 											msHeapAlloc:        600000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 600,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded but GC hasn't happened": {
 											persistUrgency:     50,
 											lenEvictList:       3000,
 											numChunksToPersist: 1000,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          42,
 											wantPersistUrgency: 275,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded, GC has happened": {
 											persistUrgency:     50,
 											lenEvictList:       3000,
 											numChunksToPersist: 1000,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 275,
 											wantChunksToEvict:  97,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded, GC has happened, urgency bumped due to low number of evictable chunks": {
 											persistUrgency:     50,
 											lenEvictList:       300,
 											numChunksToPersist: 100,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 323,
 											wantChunksToEvict:  97,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded but no evictable chunks and GC hasn't happened": {
 											persistUrgency:     50,
 											lenEvictList:       0,
 											numChunksToPersist: 1000,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          42,
 											wantPersistUrgency: 1000,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded but no evictable chunks and GC has happened": {
 											persistUrgency:     50,
 											lenEvictList:       0,
 											numChunksToPersist: 1000,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 1000,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded, very few evictable chunks, GC hasn't happened": {
 											persistUrgency:     50,
 											lenEvictList:       10,
 											numChunksToPersist: 1000,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          42,
 											wantPersistUrgency: 1000,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded, some evictable chunks (but not enough), GC hasn't happened": {
 											persistUrgency:     50,
 											lenEvictList:       50,
 											numChunksToPersist: 250,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          42,
 											wantPersistUrgency: 916,
 											wantChunksToEvict:  0,
 											wantLastNumGC:      42,
 										},
 										"target heap size exceeded, some evictable chunks (but not enough), GC has happened": {
 											persistUrgency:     50,
 											lenEvictList:       50,
 											numChunksToPersist: 250,
 											targetHeapSize:     1000000,
 											msNextGC:           1100000,
 											msHeapAlloc:        900000,
 											msNumGC:            42,
 											lastNumGC:          41,
 											wantPersistUrgency: 1000,
 											wantChunksToEvict:  50,
 											wantLastNumGC:      42,
 										},
 									}
 									s, closer := NewTestStorage(t, 1)
 									defer closer.Close()
 									for scenario, test := range tests {
 										s.persistUrgency = test.persistUrgency
 										s.numChunksToPersist = test.numChunksToPersist
 										s.targetHeapSize = test.targetHeapSize
 										s.lastNumGC = test.lastNumGC
 										s.evictList.Init()
 										for i := 0; i < test.lenEvictList; i++ {
 											s.evictList.PushBack(&struct{}{})
 										}
 										ms := runtime.MemStats{
 											NextGC:    test.msNextGC,
 											HeapAlloc: test.msHeapAlloc,
 											NumGC:     test.msNumGC,
 										}
 										chunksToEvict := s.calculatePersistUrgency(&ms)
 										if chunksToEvict != test.wantChunksToEvict {
 											t.Errorf(
 												"scenario %q: got %d chunks to evict, want %d",
 												scenario, chunksToEvict, test.wantChunksToEvict,
 											)
 										}
 										if s.persistUrgency != test.wantPersistUrgency {
 											t.Errorf(
 												"scenario %q: got persist urgency %d, want %d",
 												scenario, s.persistUrgency, test.wantPersistUrgency,
 											)
 										}
 										if s.lastNumGC != test.wantLastNumGC {
 											t.Errorf(
 												"scenario %q: got lastNumGC %d , want %d",
 												scenario, s.lastNumGC, test.wantLastNumGC,
 											)
 										}
 									}
 								}