// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package local
import (
	"fmt"
	"hash/fnv"
	"math"
	"math/rand"
	"os"
	"runtime"
	"strconv"
	"sync/atomic"
	"testing"
	"testing/quick"
	"time"

	"github.com/prometheus/common/log"
	"github.com/prometheus/common/model"
	"golang.org/x/net/context"

"github.com/prometheus/prometheus/storage/local/chunk"
2014-08-14 09:23:49 -07:00
"github.com/prometheus/prometheus/storage/metric"
2015-05-29 04:30:30 -07:00
"github.com/prometheus/prometheus/util/testutil"
2014-06-06 02:55:53 -07:00
)
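
// TestMatches appends 100 samples with varying labels, archives every tenth
// series, and then checks that MetricsForLabelMatchers returns the expected
// fingerprints for a range of matcher combinations and time windows.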
func TestMatches(t *testing.T) {
	storage, closer := NewTestStorage(t, 2)
	defer closer.Close()

	storage.archiveHighWatermark = 90
	samples := make([]*model.Sample, 100)
	fingerprints := make(model.Fingerprints, 100)

	for i := range samples {
		metric := model.Metric{
			model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
			"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
			"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
			"all":                 "const",
		}
		samples[i] = &model.Sample{
			Metric:    metric,
			Timestamp: model.Time(i),
			Value:     model.SampleValue(i),
		}
		fingerprints[i] = metric.FastFingerprint()
	}
	for _, s := range samples {
		storage.Append(s)
	}
	storage.WaitForIndexing()

	// Archive every tenth metric.
	for i, fp := range fingerprints {
		if i%10 != 0 {
			continue
		}
		s, ok := storage.fpToSeries.get(fp)
		if !ok {
			t.Fatal("could not retrieve series for fp", fp)
		}
		storage.fpLocker.Lock(fp)
		storage.persistence.archiveMetric(fp, s.metric, s.firstTime(), s.lastTime)
		storage.fpLocker.Unlock(fp)
	}

	newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
		lm, err := metric.NewLabelMatcher(matchType, name, value)
		if err != nil {
			t.Fatalf("error creating label matcher: %s", err)
		}
		return lm
	}
	var matcherTests = []struct {
		matchers metric.LabelMatchers
		expected model.Fingerprints
	}{
		{
			matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "x")},
			expected: model.Fingerprints{},
		},
		{
			matchers: metric.LabelMatchers{newMatcher(metric.Equal, "label1", "test_0")},
			expected: fingerprints[:10],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", "test_0"),
				newMatcher(metric.Equal, "label2", "test_1"),
			},
			expected: fingerprints[5:10],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "x"),
			},
			expected: fingerprints,
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
			},
			expected: fingerprints[10:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.NotEqual, "label1", "test_1"),
				newMatcher(metric.NotEqual, "label1", "test_2"),
			},
			expected: fingerprints[30:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "label1", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "label2", ""),
			},
			expected: fingerprints[:0],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.NotEqual, "label1", "test_0"),
				newMatcher(metric.Equal, "not_existent", ""),
			},
			expected: fingerprints[10:],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
			},
			expected: fingerprints[30:60],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "all", "const"),
				newMatcher(metric.RegexNoMatch, "label1", `test_[3-5]`),
			},
			expected: append(append(model.Fingerprints{}, fingerprints[:30]...), fingerprints[60:]...),
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
				newMatcher(metric.RegexMatch, "label2", `test_[4-6]`),
			},
			expected: fingerprints[35:60],
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.RegexMatch, "label1", `test_[3-5]`),
				newMatcher(metric.NotEqual, "label2", `test_4`),
			},
			expected: append(append(model.Fingerprints{}, fingerprints[30:35]...), fingerprints[45:60]...),
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", `nonexistent`),
				newMatcher(metric.RegexMatch, "label2", `test`),
			},
			expected: model.Fingerprints{},
		},
		{
			matchers: metric.LabelMatchers{
				newMatcher(metric.Equal, "label1", `test_0`),
				newMatcher(metric.RegexMatch, "label2", `nonexistent`),
			},
			expected: model.Fingerprints{},
		},
	}
	for _, mt := range matcherTests {
		metrics, err := storage.MetricsForLabelMatchers(
			context.Background(),
			model.Earliest, model.Latest,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(mt.expected) != len(metrics) {
			t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.matchers, len(metrics))
		}
		for _, m := range metrics {
			fp1 := m.Metric.FastFingerprint()
			found := false
			for _, fp2 := range mt.expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.matchers)
			}
		}

		// Smoketest for from/through.
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			model.Earliest, -10000,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(metrics) > 0 {
			t.Error("expected no matches with 'through' older than any sample")
		}
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			10000, model.Latest,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		if len(metrics) > 0 {
			t.Error("expected no matches with 'from' newer than any sample")
		}
		// Now the tricky one, cut out something from the middle.
		var (
			from    model.Time = 25
			through model.Time = 75
		)
		metrics, err = storage.MetricsForLabelMatchers(
			context.Background(),
			from, through,
			mt.matchers,
		)
		if err != nil {
			t.Fatal(err)
		}
		expected := model.Fingerprints{}
		for _, fp := range mt.expected {
			i := 0
			// Check the bound before indexing so that a missing fingerprint
			// is reported by the Fatal below rather than causing a panic.
			for ; i < len(fingerprints) && fingerprints[i] != fp; i++ {
			}
			if i == len(fingerprints) {
				t.Fatal("expected fingerprint does not exist")
			}
			if !model.Time(i).Before(from) && !model.Time(i).After(through) {
				expected = append(expected, fp)
			}
		}
		if len(expected) != len(metrics) {
			t.Errorf("expected %d range-limited matches for %q, found %d", len(expected), mt.matchers, len(metrics))
		}
		for _, m := range metrics {
			fp1 := m.Metric.FastFingerprint()
			found := false
			for _, fp2 := range expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in range-limited result", fp1, mt.matchers)
			}
		}
	}
}
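
// TestFingerprintsForLabels appends 100 samples and verifies that successive
// fingerprintsForLabelPair calls, each fed the previous result, narrow the
// match set down to the expected fingerprints for each list of label pairs.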
func TestFingerprintsForLabels(t *testing.T) {
	storage, closer := NewTestStorage(t, 2)
	defer closer.Close()

	samples := make([]*model.Sample, 100)
	fingerprints := make(model.Fingerprints, 100)

	for i := range samples {
		metric := model.Metric{
			model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
			"label1":              model.LabelValue(fmt.Sprintf("test_%d", i/10)),
			"label2":              model.LabelValue(fmt.Sprintf("test_%d", (i+5)/10)),
		}
		samples[i] = &model.Sample{
			Metric:    metric,
			Timestamp: model.Time(i),
			Value:     model.SampleValue(i),
		}
		fingerprints[i] = metric.FastFingerprint()
	}
	for _, s := range samples {
		storage.Append(s)
	}
	storage.WaitForIndexing()

	var matcherTests = []struct {
		pairs    []model.LabelPair
		expected model.Fingerprints
	}{
		{
			pairs:    []model.LabelPair{{Name: "label1", Value: "x"}},
			expected: fingerprints[:0],
		},
		{
			pairs:    []model.LabelPair{{Name: "label1", Value: "test_0"}},
			expected: fingerprints[:10],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_0"},
				{Name: "label1", Value: "test_1"},
			},
			expected: fingerprints[:0],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_0"},
				{Name: "label2", Value: "test_1"},
			},
			expected: fingerprints[5:10],
		},
		{
			pairs: []model.LabelPair{
				{Name: "label1", Value: "test_1"},
				{Name: "label2", Value: "test_2"},
			},
			expected: fingerprints[15:20],
		},
	}
	for _, mt := range matcherTests {
		var resfps map[model.Fingerprint]struct{}
		for _, pair := range mt.pairs {
			resfps = storage.fingerprintsForLabelPair(pair, nil, resfps)
		}
		if len(mt.expected) != len(resfps) {
			t.Fatalf("expected %d matches for %q, found %d", len(mt.expected), mt.pairs, len(resfps))
		}
		for fp1 := range resfps {
			found := false
			for _, fp2 := range mt.expected {
				if fp1 == fp2 {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("expected fingerprint %s for %q not in result", fp1, mt.pairs)
			}
		}
	}
}
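
// benchLabelMatchingRes is assigned the result of each benchmarked lookup so
// that the compiler cannot optimize the calls away.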
var benchLabelMatchingRes []metric.Metric
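
// BenchmarkLabelMatching populates the storage with 32^4 distinct label
// combinations and measures MetricsForLabelMatchers for a fixed set of
// matcher queries.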
func BenchmarkLabelMatching(b *testing.B) {
	s, closer := NewTestStorage(b, 2)
	defer closer.Close()

	h := fnv.New64a()
	lbl := func(x int) model.LabelValue {
		h.Reset()
		h.Write([]byte(fmt.Sprintf("%d", x)))
		return model.LabelValue(fmt.Sprintf("%d", h.Sum64()))
	}

	M := 32
	met := model.Metric{}
	for i := 0; i < M; i++ {
		met["label_a"] = lbl(i)
		for j := 0; j < M; j++ {
			met["label_b"] = lbl(j)
			for k := 0; k < M; k++ {
				met["label_c"] = lbl(k)
				for l := 0; l < M; l++ {
					met["label_d"] = lbl(l)
					s.Append(&model.Sample{
						Metric:    met.Clone(),
						Timestamp: 0,
						Value:     1,
					})
				}
			}
		}
	}
	s.WaitForIndexing()

	newMatcher := func(matchType metric.MatchType, name model.LabelName, value model.LabelValue) *metric.LabelMatcher {
		lm, err := metric.NewLabelMatcher(matchType, name, value)
		if err != nil {
			b.Fatalf("error creating label matcher: %s", err)
		}
		return lm
	}

	var matcherTests = []metric.LabelMatchers{
		{
			newMatcher(metric.Equal, "label_a", lbl(1)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
			newMatcher(metric.NotEqual, "label_d", lbl(3)),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_b", lbl(3)),
			newMatcher(metric.Equal, "label_c", lbl(3)),
			newMatcher(metric.NotEqual, "label_d", lbl(3)),
		},
		{
			newMatcher(metric.RegexMatch, "label_a", ".+"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.RegexMatch, "label_a", ".+"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(1)),
			newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
		},
		{
			newMatcher(metric.Equal, "label_a", lbl(3)),
			newMatcher(metric.Equal, "label_a", lbl(4)),
			newMatcher(metric.RegexMatch, "label_c", "("+lbl(3)+"|"+lbl(10)+")"),
		},
	}

	b.ReportAllocs()
	b.ResetTimer()

	var err error
	for i := 0; i < b.N; i++ {
		benchLabelMatchingRes = []metric.Metric{}
		for _, mt := range matcherTests {
			benchLabelMatchingRes, err = s.MetricsForLabelMatchers(
				context.Background(),
				model.Earliest, model.Latest,
				mt,
			)
			if err != nil {
				b.Fatal(err)
			}
		}
	}
	// Stop timer to not count the storage closing.
	b.StopTimer()
}
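
// BenchmarkQueryRange appends one sample for each of 8192 series and measures
// concurrent QueryRange calls over the full time range.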
func BenchmarkQueryRange(b *testing.B) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(b, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	for i := 0; i < 8192; i++ {
		s.Append(&model.Sample{
			Metric:    model.Metric{"__name__": model.LabelValue(strconv.Itoa(i)), "job": "test"},
			Timestamp: insertStart,
			Value:     1,
		})
	}
	s.WaitForIndexing()

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
		for pb.Next() {
			s.QueryRange(context.Background(), insertStart, now, lm)
		}
	})
}
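
// TestQueryRangeThroughBeforeFrom checks that QueryRange returns no iterators
// when the 'through' time lies before the 'from' time.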
func TestQueryRangeThroughBeforeFrom(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	for i := 0; i < 8192; i++ {
		s.Append(&model.Sample{
			Metric:    model.Metric{"__name__": "testmetric", "job": "test"},
			Timestamp: insertStart.Add(time.Duration(i) * time.Second),
			Value:     model.SampleValue(rand.Float64()),
		})
	}
	s.WaitForIndexing()

	lm, _ := metric.NewLabelMatcher(metric.Equal, "job", "test")
	iters, err := s.QueryRange(context.Background(), now.Add(-30*time.Minute), now.Add(-90*time.Minute), lm)
	if err != nil {
		t.Error(err)
	}
	if len(iters) != 0 {
		t.Errorf("expected no iters to be returned, got %d", len(iters))
	}
}
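
// TestRetentionCutoff appends two hours of samples, sets the retention window
// to one hour, and verifies that values older than the cutoff are not
// returned.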
func TestRetentionCutoff(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	// Stop maintenance loop to prevent actual purging.
	close(s.loopStopping)
	<-s.loopStopped
	<-s.logThrottlingStopped
	// Recreate channel to avoid panic when we really shut down.
	s.loopStopping = make(chan struct{})

	s.dropAfter = 1 * time.Hour

	for i := 0; i < 120; i++ {
		smpl := &model.Sample{
			Metric:    model.Metric{"job": "test"},
			Timestamp: insertStart.Add(time.Duration(i) * time.Minute), // 1 minute intervals.
			Value:     1,
		}
		s.Append(smpl)
	}
	s.WaitForIndexing()

	lm, err := metric.NewLabelMatcher(metric.Equal, "job", "test")
	if err != nil {
		t.Fatalf("error creating label matcher: %s", err)
	}
	its, err := s.QueryRange(context.Background(), insertStart, now, lm)
	if err != nil {
		t.Fatal(err)
	}

	if len(its) != 1 {
		t.Fatalf("expected one iterator but got %d", len(its))
	}

	val := its[0].ValueAtOrBeforeTime(now.Add(-61 * time.Minute))
	if val.Timestamp != model.Earliest {
		t.Errorf("unexpected result for timestamp before retention period")
	}

	vals := its[0].RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now})
	// We get 59 values here because the model.Now() is slightly later
	// than our now.
	if len(vals) != 59 {
		t.Errorf("expected 59 values but got %d", len(vals))
	}
	if expt := now.Add(-1 * time.Hour).Add(time.Minute); vals[0].Timestamp != expt {
		t.Errorf("unexpected timestamp for first sample: %v, expected %v", vals[0].Timestamp.Time(), expt.Time())
	}
}
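
// TestDropMetrics appends three series, archives one of them, and verifies
// that DropMetricsForLabelMatchers removes the matched series, their samples,
// and the archived series' chunk file on disk.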
func TestDropMetrics(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)

	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	chunkFileExists := func(fp model.Fingerprint) (bool, error) {
		f, err := s.persistence.openChunkFileForReading(fp)
		if err == nil {
			f.Close()
			return true, nil
		}
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}

	m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
	m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
	m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}

	lm1, err := metric.NewLabelMatcher(metric.Equal, "n1", "v1")
	if err != nil {
		t.Fatal(err)
	}
	lmAll, err := metric.NewLabelMatcher(metric.Equal, model.MetricNameLabel, "test")
	if err != nil {
		t.Fatal(err)
	}

	N := 120000

	for j, m := range []model.Metric{m1, m2, m3} {
		for i := 0; i < N; i++ {
			smpl := &model.Sample{
				Metric:    m,
				Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
				Value:     model.SampleValue(j),
			}
			s.Append(smpl)
		}
	}
	s.WaitForIndexing()

	// Archive m3, but first maintain it so that at least something is written to disk.
	fpToBeArchived := m3.FastFingerprint()
	s.maintainMemorySeries(fpToBeArchived, 0)
	s.fpLocker.Lock(fpToBeArchived)
	s.fpToSeries.del(fpToBeArchived)
	s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
	s.fpLocker.Unlock(fpToBeArchived)
	fps := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps) != 3 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps))
	}

	fpList := model.Fingerprints{m1.FastFingerprint(), m2.FastFingerprint(), fpToBeArchived}

	n, err := s.DropMetricsForLabelMatchers(context.Background(), lm1)
	if err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Fatalf("expected 1 series to be dropped, got %d", n)
	}
	s.WaitForIndexing()
	fps2 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps2) != 2 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps2))
	}

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[1]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != N {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
	exists, err := chunkFileExists(fpList[2])
	if err != nil {
		t.Fatal(err)
	}
	if !exists {
		t.Errorf("chunk file does not exist for fp=%v", fpList[2])
	}

	n, err = s.DropMetricsForLabelMatchers(context.Background(), lmAll)
	if err != nil {
		t.Fatal(err)
	}
	if n != 2 {
		t.Fatalf("expected 2 series to be dropped, got %d", n)
	}
	s.WaitForIndexing()
	fps3 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps3) != 0 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps3))
	}

	it = s.preloadChunksForRange(makeFingerprintSeriesPair(s, fpList[0]), model.Earliest, model.Latest)
	if vals := it.RangeValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now}); len(vals) != 0 {
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
it = s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fpList [ 1 ] ) , model . Earliest , model . Latest )
2015-07-13 12:12:27 -07:00
if vals := it . RangeValues ( metric . Interval { OldestInclusive : insertStart , NewestInclusive : now } ) ; len ( vals ) != 0 {
2015-09-11 06:47:23 -07:00
		t.Errorf("unexpected number of samples: %d", len(vals))
	}
	exists, err = chunkFileExists(fpList[2])
	if err != nil {
		t.Fatal(err)
	}
	if exists {
		t.Errorf("chunk file still exists for fp=%v", fpList[2])
2015-05-27 08:41:57 -07:00
	}
}
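Below is a standalone, hedged sketch of the ValueAtOrBeforeTime
semantics referenced in the commit note above and exercised by
testValueAtOrBeforeTime further down. It is not the actual series
iterator code; the samplePair type, the earliest sentinel, and the
helper name are stand-ins invented for illustration: given samples
sorted by timestamp, return the latest sample at or before t, or a
sample stamped with the earliest possible time if none exists.

package main

import (
	"fmt"
	"sort"
)

// samplePair is a stand-in for model.SamplePair.
type samplePair struct {
	Timestamp int64
	Value     float64
}

// earliest is a stand-in for model.Earliest.
const earliest = int64(-1) << 62

// valueAtOrBeforeTime returns the latest sample with Timestamp <= t.
func valueAtOrBeforeTime(samples []samplePair, t int64) samplePair {
	// Index of the first sample strictly after t.
	i := sort.Search(len(samples), func(i int) bool { return samples[i].Timestamp > t })
	if i == 0 {
		// No sample at or before t.
		return samplePair{Timestamp: earliest}
	}
	return samples[i-1]
}

func main() {
	s := []samplePair{{2, 0.2}, {4, 0.4}, {6, 0.6}}
	fmt.Println(valueAtOrBeforeTime(s, 4)) // exactly on a sample: {4 0.4}
	fmt.Println(valueAtOrBeforeTime(s, 5)) // between samples: the earlier one, {4 0.4}
	fmt.Println(valueAtOrBeforeTime(s, 1)) // before the first sample: Timestamp == earliest
}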
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollutes the function signatures quite a
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crash recovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, with the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 03:23:42 -08:00
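Below is a standalone, hedged sketch of the quarantining idea described
in the note above, not the actual memorySeriesStorage code: corruption
detected while reading series data is returned as an error rather than
causing a panic, and the caller reacts by moving the series file into a
quarantine directory for later forensics. All names here
(loadChunkHeader, quarantineSeriesFile, the paths, the magic number)
are hypothetical.

package main

import (
	"encoding/binary"
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// errCorrupt marks data corruption, which is not a programming error
// and therefore must not panic.
var errCorrupt = errors.New("series file corrupt")

// loadChunkHeader reads a fictitious 8-byte magic number and reports
// corruption as an error instead of panicking.
func loadChunkHeader(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err // Plain I/O failure; not automatically treated as corruption.
	}
	defer f.Close()
	var magic uint64
	if err := binary.Read(f, binary.BigEndian, &magic); err != nil {
		return errCorrupt // Truncated header: treat as corruption.
	}
	if magic != 0x0102030405060708 { // Arbitrary example magic number.
		return errCorrupt
	}
	return nil
}

// quarantineSeriesFile moves a corrupt series file into a side directory
// for forensics, effectively removing the series from the database.
func quarantineSeriesFile(path, quarantineDir string) error {
	if err := os.MkdirAll(quarantineDir, 0777); err != nil {
		return err
	}
	return os.Rename(path, filepath.Join(quarantineDir, filepath.Base(path)))
}

func main() {
	switch err := loadChunkHeader("data/series.db"); err {
	case nil:
		fmt.Println("series file looks sane")
	case errCorrupt:
		// Corruption is quarantined, not panicked on.
		if qerr := quarantineSeriesFile("data/series.db", "data/quarantine"); qerr != nil {
			fmt.Println("quarantine failed:", qerr)
		}
	default:
		fmt.Println("read failed (would call setDirty, not quarantine):", err)
	}
}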
func TestQuarantineMetric(t *testing.T) {
	now := model.Now()
	insertStart := now.Add(-2 * time.Hour)
2016-03-20 15:32:20 -07:00
	s, closer := NewTestStorage(t, 2)
	defer closer.Close()
	chunkFileExists := func(fp model.Fingerprint) (bool, error) {
		f, err := s.persistence.openChunkFileForReading(fp)
		if err == nil {
			f.Close()
			return true, nil
		}
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	m1 := model.Metric{model.MetricNameLabel: "test", "n1": "v1"}
	m2 := model.Metric{model.MetricNameLabel: "test", "n1": "v2"}
	m3 := model.Metric{model.MetricNameLabel: "test", "n1": "v3"}
	N := 120000
	for j, m := range []model.Metric{m1, m2, m3} {
		for i := 0; i < N; i++ {
			smpl := &model.Sample{
				Metric:    m,
				Timestamp: insertStart.Add(time.Duration(i) * time.Millisecond), // 1 millisecond intervals.
				Value:     model.SampleValue(j),
			}
			s.Append(smpl)
		}
	}
	s.WaitForIndexing()
	// Archive m3, but first maintain it so that at least something is written to disk.
	fpToBeArchived := m3.FastFingerprint()
	s.maintainMemorySeries(fpToBeArchived, 0)
	s.fpLocker.Lock(fpToBeArchived)
	s.fpToSeries.del(fpToBeArchived)
2016-03-09 09:56:30 -08:00
	s.persistence.archiveMetric(fpToBeArchived, m3, 0, insertStart.Add(time.Duration(N-1)*time.Millisecond))
	s.fpLocker.Unlock(fpToBeArchived)
	// Corrupt the series file for m3.
	f, err := os.Create(s.persistence.fileNameForFingerprint(fpToBeArchived))
	if err != nil {
		t.Fatal(err)
	}
	if _, err := f.WriteString("This is clearly not the content of a series file."); err != nil {
		t.Fatal(err)
	}
	if err := f.Close(); err != nil {
		t.Fatal(err)
	}
storage: improve index lookups
tl;dr: This is not a fundamental solution to the indexing problem
(like tindex is), but it at least avoids running into the intersection
problem as much as possible.
In more detail:
Imagine the following query:
nicely:aggregating:rule{job="foo",env="prod"}
While it uses a nicely aggregating recording rule (which might have a
very low cardinality), Prometheus still intersects the low number of
fingerprints for `{__name__="nicely:aggregating:rule"}` with the many
thousands of fingerprints matching `{job="foo"}` and with the millions
of fingerprints matching `{env="prod"}`. This totally innocuous query
is dead slow if the Prometheus server has a lot of time series with
the `{env="prod"}` label. Ironically, if you make the query more
complicated, it becomes blazingly fast:
nicely:aggregating:rule{job=~"foo",env=~"prod"}
Why so? Because Prometheus only intersects with non-Equal matchers if
there are no Equal matchers. That's good in this case because it
retrieves the few fingerprints for
`{__name__="nicely:aggregating:rule"}` and then goes right ahead to
retrieve the metrics for those FPs, checking individually whether they
match the other matchers.
This change generalizes the idea of when to stop intersecting FPs
and go into "retrieve metrics and check them individually against
remaining matchers" mode:
- First, sort all matchers by "expected cardinality". Matchers
matching the empty string are always worst (and never used for
intersections). Equal matchers are in general considered best, but by
using some crude heuristics, we declare some better than others
(instance labels or anything that looks like a recording rule).
- Then go through the matchers until we hit a threshold of remaining
FPs in the intersection. This threshold is higher if we are already
in the non-Equal matcher area as intersection is even more expensive
here.
- Once the threshold has been reached (or we have run out of matchers
that do not match the empty string), start with "retrieve metrics
and check them individually against remaining matchers". (A standalone
sketch of this ordering-and-threshold idea follows TestQuarantineMetric
below.)
A beefy server at SoundCloud was spending 67% of its CPU time in index
lookups (fingerprintsForLabelPairs), serving mostly a dashboard that
is exclusively built with recording rules. With this change, it spends
only 35% in fingerprintsForLabelPairs. The CPU usage dropped from 26
cores to 18 cores. The median latency for query_range dropped from 14s
to 50ms(!). As expected, higher percentile latency didn't improve that
much because the new approach is _occasionally_ running into the worst
case while the old one was _systematically_ doing so. The 99th
percentile latency is now about as high as the median before (14s)
while it was almost twice as high before (26s).
2016-06-28 11:18:32 -07:00
	fps := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps) != 3 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps))
	}
	// This will access the corrupt file and lead to quarantining.
2016-09-18 04:20:46 -07:00
	iter := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fpToBeArchived), now.Add(-2*time.Hour-1*time.Minute), now.Add(-2*time.Hour))
2016-07-11 11:27:25 -07:00
	iter.Close()
	time.Sleep(time.Second) // Give time to quarantine. TODO(beorn7): Find a better way to wait.
	s.WaitForIndexing()
	fps2 := s.fingerprintsForLabelPair(model.LabelPair{
		Name: model.MetricNameLabel, Value: "test",
	}, nil, nil)
	if len(fps2) != 2 {
		t.Errorf("unexpected number of fingerprints: %d", len(fps2))
	}
	exists, err := chunkFileExists(fpToBeArchived)
	if err != nil {
		t.Fatal(err)
	}
	if exists {
		t.Errorf("chunk file exists for fp=%v", fpToBeArchived)
	}
}
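Below is a standalone, hedged sketch of the matcher-ordering idea from
the "storage: improve index lookups" note above, not the actual index
code: matchers are sorted by a crude cardinality estimate, fingerprint
sets are intersected only while the running intersection is still
large, and the remaining matchers are then checked per fingerprint.
The matcher and index types are hypothetical stand-ins.

package main

import (
	"fmt"
	"sort"
)

type fingerprint uint64

// matcher is a hypothetical stand-in for a label matcher.
type matcher struct {
	name, value string
	equal       bool // true for an Equal matcher, false for e.g. a regexp matcher
}

// estimatedCost is a crude heuristic: Equal matchers are considered
// cheapest, and an Equal matcher on the metric name cheapest of all.
func estimatedCost(m matcher) int {
	switch {
	case m.equal && m.name == "__name__":
		return 0
	case m.equal:
		return 1
	default:
		return 2
	}
}

// lookup intersects fingerprint sets matcher by matcher until the running
// intersection drops to or below threshold, then switches to checking the
// remaining matchers individually against each candidate fingerprint.
func lookup(
	index map[matcher]map[fingerprint]struct{},
	ms []matcher,
	threshold int,
	check func(fingerprint, matcher) bool,
) []fingerprint {
	sort.Slice(ms, func(i, j int) bool { return estimatedCost(ms[i]) < estimatedCost(ms[j]) })

	var current map[fingerprint]struct{}
	rest := ms
	for i, m := range ms {
		if current != nil && len(current) <= threshold {
			rest = ms[i:]
			break
		}
		next := map[fingerprint]struct{}{}
		for fp := range index[m] {
			if current == nil {
				next[fp] = struct{}{}
			} else if _, ok := current[fp]; ok {
				next[fp] = struct{}{}
			}
		}
		current = next
		rest = ms[i+1:]
	}

	var out []fingerprint
	for fp := range current {
		ok := true
		for _, m := range rest {
			if !check(fp, m) {
				ok = false
				break
			}
		}
		if ok {
			out = append(out, fp)
		}
	}
	return out
}

func main() {
	// Toy index: the cheap __name__ matcher narrows the set to a single
	// fingerprint, so the large {env="prod"} set is never intersected.
	idx := map[matcher]map[fingerprint]struct{}{
		{name: "__name__", value: "nicely:aggregating:rule", equal: true}: {1: {}},
		{name: "env", value: "prod", equal: true}:                         {1: {}, 2: {}, 3: {}},
	}
	check := func(fp fingerprint, m matcher) bool { _, ok := idx[m][fp]; return ok }
	ms := []matcher{
		{name: "env", value: "prod", equal: true},
		{name: "__name__", value: "nicely:aggregating:rule", equal: true},
	}
	fmt.Println(lookup(idx, ms, 10, check)) // [1]
}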
2014-10-28 11:01:41 -07:00
// TestLoop is just a smoke test for the loop method: check that we can
// switch it on and off without disaster.
func TestLoop(t *testing.T) {
2015-02-26 06:19:44 -08:00
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}
2015-08-20 08:18:46 -07:00
	samples := make(model.Samples, 1000)
2014-10-28 11:01:41 -07:00
	for i := range samples {
2015-08-20 08:18:46 -07:00
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
2014-10-28 11:01:41 -07:00
		}
	}
2015-05-28 11:58:38 -07:00
	directory := testutil.NewTemporaryDirectory("test_storage", t)
2014-10-28 11:01:41 -07:00
	defer directory.Close()
	o := &MemorySeriesStorageOptions{
		TargetHeapSize: 100000,
2014-10-28 11:01:41 -07:00
		PersistenceRetentionPeriod: 24 * 7 * time.Hour,
		PersistenceStoragePath:     directory.Path(),
storage: Use staleness delta as head chunk timeout
Currently, if a series stops to exist, its head chunk will be kept
open for an hour. That prevents it from being persisted. Which
prevents it from being evicted. Which prevents the series from being
archived.
Most of the time, once no sample has been added to a series within the
staleness limit, we can be pretty confident that this series will not
receive samples anymore. The whole chain as described above can be
started after 5m instead of 1h. In the relaxed case, this doesn't
change a lot as the head chunk timeout is only checked during series
maintenance, and usually, a series is only maintained every six
hours. However, there is the typical scenario where a large service is
deployed, the deploy turns out to be bad, and then it is deployed
again within minutes, and quite quickly the number of time series has
tripled. That's the point where the Prometheus server is stressed and
switches (rightfully) into rushed mode. In that mode, time series are
processed as quickly as possible, but all of that is in vain if all of
those recently ended time series cannot be persisted yet for another
hour. In that scenario, this change will help most, and it's exactly
the scenario where help is most desperately needed.
2017-03-26 14:44:50 -07:00
		HeadChunkTimeout: 5 * time.Minute,
2014-10-28 11:01:41 -07:00
		CheckpointInterval: 250 * time.Millisecond,
2015-03-19 07:41:50 -07:00
		SyncStrategy: Adaptive,
2016-01-11 07:42:10 -08:00
		MinShrinkRatio: 0.1,
2014-10-28 11:01:41 -07:00
	}
2015-05-18 10:26:28 -07:00
	storage := NewMemorySeriesStorage(o)
2015-05-20 07:12:07 -07:00
	if err := storage.Start(); err != nil {
2015-09-11 06:47:23 -07:00
		t.Errorf("Error starting storage: %s", err)
2014-10-28 11:01:41 -07:00
	}
2015-03-14 19:36:15 -07:00
	for _, s := range samples {
		storage.Append(s)
	}
2015-02-26 06:19:44 -08:00
	storage.WaitForIndexing()
2017-02-01 10:41:15 -08:00
	fp := model.Metric{}.FastFingerprint()
	series, _ := storage.fpToSeries.get(fp)
	storage.fpLocker.Lock(fp)
2015-02-26 06:19:44 -08:00
	cdsBefore := len(series.chunkDescs)
2017-02-01 10:41:15 -08:00
	storage.fpLocker.Unlock(fp)
2015-02-26 06:19:44 -08:00
	time.Sleep(fpMaxWaitDuration + time.Second) // TODO(beorn7): Ugh, need to wait for maintenance to kick in.
2017-02-01 10:41:15 -08:00
	storage.fpLocker.Lock(fp)
2015-02-26 06:19:44 -08:00
	cdsAfter := len(series.chunkDescs)
2017-02-01 10:41:15 -08:00
	storage.fpLocker.Unlock(fp)
2014-10-28 11:01:41 -07:00
	storage.Stop()
2015-02-26 06:19:44 -08:00
	if cdsBefore <= cdsAfter {
		t.Errorf(
			"Number of chunk descriptors should have gone down by now. Got before %d, after %d.",
			cdsBefore, cdsAfter,
		)
	}
2014-10-28 11:01:41 -07:00
}
2016-09-21 14:44:27 -07:00
func testChunk(t *testing.T, encoding chunk.Encoding) {
2015-08-20 08:18:46 -07:00
	samples := make(model.Samples, 500000)
2014-06-06 02:55:53 -07:00
	for i := range samples {
2015-08-20 08:18:46 -07:00
		samples[i] = &model.Sample{
			Timestamp: model.Time(i),
			Value:     model.SampleValue(float64(i) * 0.2),
2014-06-06 02:55:53 -07:00
		}
	}
2015-03-13 07:49:07 -07:00
	s, closer := NewTestStorage(t, encoding)
2014-06-06 02:55:53 -07:00
	defer closer.Close()
2015-03-14 19:36:15 -07:00
	for _, sample := range samples {
		s.Append(sample)
	}
2015-02-12 08:23:42 -08:00
	s.WaitForIndexing()
2014-06-06 02:55:53 -07:00
2015-05-06 07:53:12 -07:00
	for m := range s.fpToSeries.iter() {
		s.fpLocker.Lock(m.fp)
2015-08-22 05:52:35 -07:00
		var values []model.SamplePair
2014-11-13 11:50:25 -08:00
		for _, cd := range m.series.chunkDescs {
2016-09-21 14:44:27 -07:00
			if cd.IsEvicted() {
2014-11-13 11:50:25 -08:00
				continue
			}
2016-09-21 14:44:27 -07:00
			it := cd.C.NewIterator()
2016-09-21 08:56:55 -07:00
			for it.Scan() {
				values = append(values, it.Value())
2016-03-07 10:50:13 -08:00
			}
2016-09-21 08:56:55 -07:00
			if it.Err() != nil {
				t.Error(it.Err())
2014-11-13 11:50:25 -08:00
			}
		}
		for i, v := range values {
2014-06-06 02:55:53 -07:00
			if samples[i].Timestamp != v.Timestamp {
2014-11-13 11:50:25 -08:00
				t.Errorf("%d. Got %v; want %v", i, v.Timestamp, samples[i].Timestamp)
2014-06-06 02:55:53 -07:00
			}
2015-03-06 07:03:03 -08:00
			if samples[i].Value != v.Value {
2014-11-13 11:50:25 -08:00
				t.Errorf("%d. Got %v; want %v", i, v.Value, samples[i].Value)
2014-06-06 02:55:53 -07:00
			}
		}
		s.fpLocker.Unlock(m.fp)
2014-06-06 02:55:53 -07:00
	}
2015-05-20 09:10:29 -07:00
	log.Info("test done, closing")
2014-06-06 02:55:53 -07:00
}
2015-03-04 04:40:18 -08:00
func TestChunkType0(t *testing.T) {
	testChunk(t, 0)
}
func TestChunkType1(t *testing.T) {
	testChunk(t, 1)
}
2016-03-12 12:34:51 -08:00
func TestChunkType2(t *testing.T) {
	testChunk(t, 2)
}
2016-09-21 14:44:27 -07:00
func testValueAtOrBeforeTime(t *testing.T, encoding chunk.Encoding) {
2015-08-20 08:18:46 -07:00
	samples := make(model.Samples, 10000)
2014-06-06 02:55:53 -07:00
	for i := range samples {
2015-08-20 08:18:46 -07:00
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
2014-06-06 02:55:53 -07:00
		}
	}
2015-03-13 07:49:07 -07:00
	s, closer := NewTestStorage(t, encoding)
2014-06-06 02:55:53 -07:00
	defer closer.Close()
2015-03-14 19:36:15 -07:00
	for _, sample := range samples {
		s.Append(sample)
	}
2015-02-12 08:23:42 -08:00
	s.WaitForIndexing()
2014-06-06 02:55:53 -07:00
2015-08-20 08:18:46 -07:00
	fp := model.Metric{}.FastFingerprint()
2014-06-06 02:55:53 -07:00
2016-09-18 04:20:46 -07:00
	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
2014-06-06 02:55:53 -07:00
2014-08-14 09:23:49 -07:00
// #1 Exactly on a sample.
2014-06-06 02:55:53 -07:00
	for i, expected := range samples {
		actual := it.ValueAtOrBeforeTime(expected.Timestamp)
2014-06-06 02:55:53 -07:00
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
2014-06-06 02:55:53 -07:00
		}
		if expected.Value != actual.Value {
			t.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
2014-06-06 02:55:53 -07:00
		}
	}
2014-08-14 09:23:49 -07:00
	// #2 Between samples.
	for i, expected := range samples {
2014-08-14 09:23:49 -07:00
		if i == len(samples)-1 {
			continue
		}
		actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)
2014-08-14 09:23:49 -07:00
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
2014-08-14 09:23:49 -07:00
		}
		if expected.Value != actual.Value {
			t.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
2014-08-14 09:23:49 -07:00
		}
	}
	// #3 Corner cases: Just before the first sample, just after the last.
	expected := &model.Sample{Timestamp: model.Earliest}
	actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
	if expected.Timestamp != actual.Timestamp {
		t.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
	}
	if expected.Value != actual.Value {
		t.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
	}
	expected = samples[len(samples)-1]
	actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
	if expected.Timestamp != actual.Timestamp {
		t.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
	}
	if expected.Value != actual.Value {
		t.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
	}

	// #4 Query alternatingly exactly on and just between timestamps.
	// Exposes issue #2965.
	for i, expected := range samples {
		i *= 2
		actual := it.ValueAtOrBeforeTime(expected.Timestamp)
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}
		i++
		actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
		if expected.Timestamp != actual.Timestamp {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			t.Errorf("4.%d. Got %v; want %v", i, actual.Value, expected.Value)
		}
	}
}
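
// The chunk-type test variants below run testValueAtOrBeforeTime once per
// supported chunk encoding.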
func TestValueAtTimeChunkType0(t *testing.T) {
	testValueAtOrBeforeTime(t, 0)
}

func TestValueAtTimeChunkType1(t *testing.T) {
	testValueAtOrBeforeTime(t, 1)
}

func TestValueAtTimeChunkType2(t *testing.T) {
	testValueAtOrBeforeTime(t, 2)
}
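
// benchmarkValueAtOrBeforeTime appends 10,000 evenly spaced samples to a test
// storage using the given chunk encoding, preloads the series over the full
// time range, and then repeatedly queries the resulting iterator with
// ValueAtOrBeforeTime.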
func benchmarkValueAtOrBeforeTime(b *testing.B, encoding chunk.Encoding) {
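	// Samples are two time units apart so that lookups can land both exactly
	// on and just after a sample.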
	samples := make(model.Samples, 10000)
	for i := range samples {
		samples[i] = &model.Sample{
			Timestamp: model.Time(2 * i),
			Value:     model.SampleValue(float64(i) * 0.2),
		}
	}

	s, closer := NewTestStorage(b, encoding)
	defer closer.Close()

	for _, sample := range samples {
		s.Append(sample)
	}
	s.WaitForIndexing()

	fp := model.Metric{}.FastFingerprint()
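
	// Preload the entire time range so the iterator covers every appended sample.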
	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), model.Earliest, model.Latest)
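
	// Exclude the setup above from the measured benchmark time.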
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// #1 Exactly on a sample.
		for i, expected := range samples {
			actual := it.ValueAtOrBeforeTime(expected.Timestamp)
			if expected.Timestamp != actual.Timestamp {
				b.Errorf("1.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
			}
			if expected.Value != actual.Value {
				b.Errorf("1.%d. Got %v; want %v", i, actual.Value, expected.Value)
			}
		}

		// #2 Between samples.
		for i, expected := range samples {
			if i == len(samples)-1 {
				continue
			}
			actual := it.ValueAtOrBeforeTime(expected.Timestamp + 1)
			if expected.Timestamp != actual.Timestamp {
				b.Errorf("2.%d. Got %v; want %v", i, actual.Timestamp, expected.Timestamp)
			}
			if expected.Value != actual.Value {
				b.Errorf("2.%d. Got %v; want %v", i, actual.Value, expected.Value)
			}
		}

		// #3 Corner cases: Just before the first sample, just after the last.
		expected := &model.Sample{Timestamp: model.Earliest}
		actual := it.ValueAtOrBeforeTime(samples[0].Timestamp - 1)
		if expected.Timestamp != actual.Timestamp {
			b.Errorf("3.1. Got %v; want %v", actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			b.Errorf("3.1. Got %v; want %v", actual.Value, expected.Value)
		}
		expected = samples[len(samples)-1]
		actual = it.ValueAtOrBeforeTime(expected.Timestamp + 1)
		if expected.Timestamp != actual.Timestamp {
			b.Errorf("3.2. Got %v; want %v", actual.Timestamp, expected.Timestamp)
		}
		if expected.Value != actual.Value {
			b.Errorf("3.2. Got %v; want %v", actual.Value, expected.Value)
		}
	}
}
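
// The chunk-type benchmark variants below run the same workload once per
// chunk encoding.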
Streamline series iterator creation
This will fix issue #1035 and will also help to make issue #1264 less
bad.
The fundamental problem in the current code:
In the preload phase, we quite accurately determine which chunks will
be used for the query being executed. However, in the subsequent step
of creating series iterators, the created iterators are referencing
_all_ in-memory chunks in their series, even the un-pinned ones. In
iterator creation, we copy a pointer to each in-memory chunk of a
series into the iterator. While this creates a certain amount of
allocation churn, the worst thing about it is that copying the chunk
pointer out of the chunkDesc requires a mutex acquisition. (Remember
that the iterator will also reference un-pinned chunks, so we need to
acquire the mutex to protect against concurrent eviction.) The worst
case happens if a series doesn't even contain any relevant samples for
the query time range. We notice that during preloading but then we
will still create a series iterator for it. But even for series that
do contain relevant samples, the overhead is quite bad for instant
queries that retrieve a single sample from each series, but still go
through all the effort of series iterator creation. All of that is
particularly bad if a series has many in-memory chunks.
This commit addresses the problem from two sides:
First, it merges preloading and iterator creation into one step,
i.e. the preload call returns an iterator for exactly the preloaded
chunks.
Second, the required mutex acquisition in chunkDesc has been greatly
reduced. That was enabled by a side effect of the first step, which is
that the iterator is only referencing pinned chunks, so there is no
risk of concurrent eviction anymore, and chunks can be accessed
without mutex acquisition.
To simplify the code changes for the above, the long-planned change of
ValueAtTime to ValueAtOrBefore time was performed at the same
time. (It should have been done first, but it kind of accidentally
happened while I was in the middle of writing the series iterator
changes. Sorry for that.) So far, we actively filtered the up to two
values that were returned by ValueAtTime, i.e. we invested work to
retrieve up to two values, and then we invested more work to throw one
of them away.
The SeriesIterator.BoundaryValues method can be removed once #1401 is
fixed. But I really didn't want to load even more changes into this
PR.
Benchmarks:
The BenchmarkFuzz.* benchmarks take 83% less time (i.e. run about six times
faster) and allocate 95% fewer bytes. The reason for that is that the
benchmark reads one sample after another from the time series and
creates a new series iterator for each sample read.
To find out how much these improvements matter in practice, I have
mirrored a beefy Prometheus server at SoundCloud that suffers from
both issues #1035 and #1264. To reach steady state that would be
comparable, the server needs to run for 15d. So far, it has run for
1d. The test server currently has only half as many memory time series
and 60% of the memory chunks the main server has. The 90th percentile
rule evaluation cycle time is ~11s on the main server and only ~3s on
the test server. However, these numbers might get much closer over
time.
In addition to performance improvements, this commit removes about 150
LOC.
2016-02-16 09:47:50 -08:00
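For illustration, a minimal sketch of the merged preload-and-iterate pattern as the tests below use it (the helper name exampleInstantRead, its signature, and the assumption that ValueAtOrBeforeTime returns a model.SamplePair are illustrative, not part of the original code):
// exampleInstantRead is an illustrative sketch only. Preloading now returns an
// iterator over exactly the pinned chunks, so the read below touches no
// un-pinned chunk and needs no per-access mutex.
func exampleInstantRead(s *MemorySeriesStorage, fp model.Fingerprint, ts model.Time) model.SamplePair {
	it := s.preloadChunksForInstant(makeFingerprintSeriesPair(s, fp), ts, ts)
	defer it.Close()
	return it.ValueAtOrBeforeTime(ts)
}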
func BenchmarkValueAtOrBeforeTimeChunkType0(b *testing.B) {
	benchmarkValueAtOrBeforeTime(b, 0)
2015-05-19 10:12:01 -07:00
}
2015-05-20 10:13:06 -07:00
func BenchmarkValueAtOrBeforeTimeChunkType1(b *testing.B) {
2016-02-16 09:47:50 -08:00
	benchmarkValueAtOrBeforeTime(b, 1)
2015-05-19 10:12:01 -07:00
}
2016-03-12 12:34:51 -08:00
func BenchmarkValueAtOrBeforeTimeChunkType2(b *testing.B) {
	benchmarkValueAtOrBeforeTime(b, 2)
}
2016-09-21 14:44:27 -07:00
func testRangeValues ( t * testing . T , encoding chunk . Encoding ) {
2015-08-20 08:18:46 -07:00
samples := make ( model . Samples , 10000 )
2014-06-06 02:55:53 -07:00
for i := range samples {
2015-08-20 08:18:46 -07:00
samples [ i ] = & model . Sample {
Timestamp : model . Time ( 2 * i ) ,
Value : model . SampleValue ( float64 ( i ) * 0.2 ) ,
2014-06-06 02:55:53 -07:00
}
}
2015-03-13 07:49:07 -07:00
s , closer := NewTestStorage ( t , encoding )
2014-06-06 02:55:53 -07:00
defer closer . Close ( )
2015-03-14 19:36:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
2015-02-12 08:23:42 -08:00
s . WaitForIndexing ( )
2014-06-06 02:55:53 -07:00
2015-08-20 08:18:46 -07:00
fp := model . Metric { } . FastFingerprint ( )
2014-06-06 02:55:53 -07:00
2016-09-18 04:20:46 -07:00
it := s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , model . Earliest , model . Latest )
2014-06-06 02:55:53 -07:00
2014-08-14 09:23:49 -07:00
// #1 Zero length interval at sample.
2014-06-06 02:55:53 -07:00
for i , expected := range samples {
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected . Timestamp ,
NewestInclusive : expected . Timestamp ,
} )
2014-06-06 02:55:53 -07:00
2014-08-14 09:23:49 -07:00
if len ( actual ) != 1 {
t . Fatalf ( "1.%d. Expected exactly one result, got %d." , i , len ( actual ) )
}
2014-06-06 02:55:53 -07:00
if expected . Timestamp != actual [ 0 ] . Timestamp {
2014-08-14 09:23:49 -07:00
t . Errorf ( "1.%d. Got %v; want %v." , i , actual [ 0 ] . Timestamp , expected . Timestamp )
2014-06-06 02:55:53 -07:00
}
if expected . Value != actual [ 0 ] . Value {
2014-08-14 09:23:49 -07:00
t . Errorf ( "1.%d. Got %v; want %v." , i , actual [ 0 ] . Value , expected . Value )
}
}
// #2 Zero length interval off sample.
for i , expected := range samples {
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected . Timestamp + 1 ,
NewestInclusive : expected . Timestamp + 1 ,
} )
if len ( actual ) != 0 {
t . Fatalf ( "2.%d. Expected no result, got %d." , i , len ( actual ) )
}
}
// #3 2sec interval around sample.
for i , expected := range samples {
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected . Timestamp - 1 ,
NewestInclusive : expected . Timestamp + 1 ,
} )
if len ( actual ) != 1 {
t . Fatalf ( "3.%d. Expected exactly one result, got %d." , i , len ( actual ) )
}
if expected . Timestamp != actual [ 0 ] . Timestamp {
t . Errorf ( "3.%d. Got %v; want %v." , i , actual [ 0 ] . Timestamp , expected . Timestamp )
}
if expected . Value != actual [ 0 ] . Value {
t . Errorf ( "3.%d. Got %v; want %v." , i , actual [ 0 ] . Value , expected . Value )
}
}
// #4 2sec interval sample to sample.
for i , expected1 := range samples {
if i == len ( samples ) - 1 {
continue
}
expected2 := samples [ i + 1 ]
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected1 . Timestamp ,
NewestInclusive : expected1 . Timestamp + 2 ,
} )
if len ( actual ) != 2 {
t . Fatalf ( "4.%d. Expected exactly 2 results, got %d." , i , len ( actual ) )
}
if expected1 . Timestamp != actual [ 0 ] . Timestamp {
t . Errorf ( "4.%d. Got %v for 1st result; want %v." , i , actual [ 0 ] . Timestamp , expected1 . Timestamp )
}
if expected1 . Value != actual [ 0 ] . Value {
t . Errorf ( "4.%d. Got %v for 1st result; want %v." , i , actual [ 0 ] . Value , expected1 . Value )
}
if expected2 . Timestamp != actual [ 1 ] . Timestamp {
t . Errorf ( "4.%d. Got %v for 2nd result; want %v." , i , actual [ 1 ] . Timestamp , expected2 . Timestamp )
}
if expected2 . Value != actual [ 1 ] . Value {
t . Errorf ( "4.%d. Got %v for 2nd result; want %v." , i , actual [ 1 ] . Value , expected2 . Value )
2014-06-06 02:55:53 -07:00
}
}
2014-08-14 09:23:49 -07:00
// #5 corner cases: Interval ends at first sample, interval starts
// at last sample, interval entirely before/after samples.
expected := samples [ 0 ]
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected . Timestamp - 2 ,
NewestInclusive : expected . Timestamp ,
} )
if len ( actual ) != 1 {
t . Fatalf ( "5.1. Expected exactly one result, got %d." , len ( actual ) )
}
if expected . Timestamp != actual [ 0 ] . Timestamp {
t . Errorf ( "5.1. Got %v; want %v." , actual [ 0 ] . Timestamp , expected . Timestamp )
}
if expected . Value != actual [ 0 ] . Value {
t . Errorf ( "5.1. Got %v; want %v." , actual [ 0 ] . Value , expected . Value )
}
expected = samples [ len ( samples ) - 1 ]
2015-05-20 10:13:06 -07:00
actual = it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : expected . Timestamp ,
NewestInclusive : expected . Timestamp + 2 ,
} )
if len ( actual ) != 1 {
t . Fatalf ( "5.2. Expected exactly one result, got %d." , len ( actual ) )
}
if expected . Timestamp != actual [ 0 ] . Timestamp {
t . Errorf ( "5.2. Got %v; want %v." , actual [ 0 ] . Timestamp , expected . Timestamp )
}
if expected . Value != actual [ 0 ] . Value {
t . Errorf ( "5.2. Got %v; want %v." , actual [ 0 ] . Value , expected . Value )
}
firstSample := samples [ 0 ]
2015-05-20 10:13:06 -07:00
actual = it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : firstSample . Timestamp - 4 ,
NewestInclusive : firstSample . Timestamp - 2 ,
} )
if len ( actual ) != 0 {
t . Fatalf ( "5.3. Expected no results, got %d." , len ( actual ) )
}
lastSample := samples [ len ( samples ) - 1 ]
2015-05-20 10:13:06 -07:00
actual = it . RangeValues ( metric . Interval {
2014-08-14 09:23:49 -07:00
OldestInclusive : lastSample . Timestamp + 2 ,
NewestInclusive : lastSample . Timestamp + 4 ,
} )
if len ( actual ) != 0 {
t . Fatalf ( "5.3. Expected no results, got %d." , len ( actual ) )
}
2014-06-06 02:55:53 -07:00
}
2015-05-20 10:13:06 -07:00
func TestRangeValuesChunkType0 ( t * testing . T ) {
testRangeValues ( t , 0 )
2015-03-04 04:40:18 -08:00
}
2015-05-20 10:13:06 -07:00
func TestRangeValuesChunkType1 ( t * testing . T ) {
testRangeValues ( t , 1 )
2015-03-04 04:40:18 -08:00
}
2016-03-12 12:34:51 -08:00
func TestRangeValuesChunkType2 ( t * testing . T ) {
testRangeValues ( t , 2 )
}
2016-09-21 14:44:27 -07:00
func benchmarkRangeValues ( b * testing . B , encoding chunk . Encoding ) {
2015-08-20 08:18:46 -07:00
samples := make ( model . Samples , 10000 )
2015-05-19 10:12:01 -07:00
for i := range samples {
2015-08-20 08:18:46 -07:00
samples [ i ] = & model . Sample {
Timestamp : model . Time ( 2 * i ) ,
Value : model . SampleValue ( float64 ( i ) * 0.2 ) ,
2015-05-19 10:12:01 -07:00
}
}
s , closer := NewTestStorage ( b , encoding )
defer closer . Close ( )
for _ , sample := range samples {
s . Append ( sample )
}
s . WaitForIndexing ( )
2015-08-20 08:18:46 -07:00
fp := model . Metric { } . FastFingerprint ( )
2015-05-19 10:12:01 -07:00
2016-09-18 04:20:46 -07:00
it := s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , model . Earliest , model . Latest )
2016-02-16 09:47:50 -08:00
2015-05-19 10:12:01 -07:00
b . ResetTimer ( )
for i := 0 ; i < b . N ; i ++ {
for _ , sample := range samples {
2015-05-20 10:13:06 -07:00
actual := it . RangeValues ( metric . Interval {
2015-05-19 10:12:01 -07:00
OldestInclusive : sample . Timestamp - 20 ,
NewestInclusive : sample . Timestamp + 20 ,
} )
if len ( actual ) < 10 {
b . Fatalf ( "not enough samples found" )
}
}
}
}
2015-05-20 10:13:06 -07:00
func BenchmarkRangeValuesChunkType0 ( b * testing . B ) {
benchmarkRangeValues ( b , 0 )
2015-05-19 10:12:01 -07:00
}
2015-05-20 10:13:06 -07:00
func BenchmarkRangeValuesChunkType1 ( b * testing . B ) {
benchmarkRangeValues ( b , 1 )
2015-03-04 04:40:18 -08:00
}
2016-03-12 12:34:51 -08:00
func BenchmarkRangeValuesChunkType2 ( b * testing . B ) {
benchmarkRangeValues ( b , 2 )
}
2016-09-21 14:44:27 -07:00
func testEvictAndPurgeSeries ( t * testing . T , encoding chunk . Encoding ) {
2015-08-20 08:18:46 -07:00
samples := make ( model . Samples , 10000 )
2014-10-28 11:01:41 -07:00
for i := range samples {
2015-08-20 08:18:46 -07:00
samples [ i ] = & model . Sample {
Timestamp : model . Time ( 2 * i ) ,
Value : model . SampleValue ( float64 ( i * i ) ) ,
2014-10-28 11:01:41 -07:00
}
}
2015-03-13 07:49:07 -07:00
s , closer := NewTestStorage ( t , encoding )
2014-10-28 11:01:41 -07:00
defer closer . Close ( )
2015-03-14 19:36:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
2015-02-12 08:23:42 -08:00
s . WaitForIndexing ( )
2014-10-28 11:01:41 -07:00
2015-08-20 08:18:46 -07:00
fp := model . Metric { } . FastFingerprint ( )
2014-10-28 11:01:41 -07:00
2015-02-26 06:19:44 -08:00
// Drop ~half of the chunks.
2015-05-19 10:12:01 -07:00
s . maintainMemorySeries ( fp , 10000 )
2016-09-18 04:20:46 -07:00
it := s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , model . Earliest , model . Latest )
2016-02-24 08:16:24 -08:00
actual := it . RangeValues ( metric . Interval {
2014-10-28 11:01:41 -07:00
OldestInclusive : 0 ,
2015-05-19 10:12:01 -07:00
NewestInclusive : 100000 ,
2014-10-28 11:01:41 -07:00
} )
2016-02-24 08:16:24 -08:00
if len ( actual ) < 4000 {
t . Fatalf ( "expected more than %d results after purging half of series, got %d" , 4000 , len ( actual ) )
2014-10-28 11:01:41 -07:00
}
2015-05-19 10:12:01 -07:00
if actual [ 0 ] . Timestamp < 6000 || actual [ 0 ] . Timestamp > 10000 {
2014-10-28 11:01:41 -07:00
t . Errorf ( "1st timestamp out of expected range: %v" , actual [ 0 ] . Timestamp )
}
2015-08-20 08:18:46 -07:00
want := model . Time ( 19998 )
2016-02-24 08:16:24 -08:00
if actual [ len ( actual ) - 1 ] . Timestamp != want {
2014-10-28 11:01:41 -07:00
t . Errorf ( "2nd timestamp: want %v, got %v" , want , actual [ 1 ] . Timestamp )
}
2015-02-26 06:19:44 -08:00
// Drop everything.
2015-05-19 10:12:01 -07:00
s . maintainMemorySeries ( fp , 100000 )
2016-09-18 04:20:46 -07:00
it = s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , model . Earliest , model . Latest )
2016-02-24 08:16:24 -08:00
actual = it . RangeValues ( metric . Interval {
2014-10-28 11:01:41 -07:00
OldestInclusive : 0 ,
2015-05-19 10:12:01 -07:00
NewestInclusive : 100000 ,
2014-10-28 11:01:41 -07:00
} )
if len ( actual ) != 0 {
t . Fatal ( "expected zero results after purging the whole series" )
}
// Recreate series.
2015-03-14 19:36:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
2015-02-12 08:23:42 -08:00
s . WaitForIndexing ( )
2014-10-28 11:01:41 -07:00
2015-05-06 07:53:12 -07:00
series , ok := s . fpToSeries . get ( fp )
2014-10-28 11:01:41 -07:00
if ! ok {
t . Fatal ( "could not find series" )
}
2014-11-13 11:50:25 -08:00
// Persist head chunk so we can safely archive.
2015-03-08 18:33:10 -07:00
series . headChunkClosed = true
2015-08-20 08:18:46 -07:00
s . maintainMemorySeries ( fp , model . Earliest )
2014-10-28 11:01:41 -07:00
2014-11-13 11:50:25 -08:00
// Archive metrics.
2015-05-06 07:53:12 -07:00
s . fpToSeries . del ( fp )
2016-09-21 14:44:27 -07:00
lastTime , err := series . head ( ) . LastTime ( )
Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollutes the function signatures quite a
lot. Well, this is Go...
The ideas behind this are the following:
- panic only if it's a programming error. Data corruptions happen, and
they are not programming errors.
- If we detect a data corruption, we "quarantine" the series,
essentially removing it from the database and putting its data into
a separate directory for forensics.
- Failure during writing to a series file is not considered corruption
automatically. It will call setDirty, though, so that a
crash recovery upon the next restart will commence and check for
that.
- Series quarantining and setDirty calls are logged and counted in
metrics, but are hidden from the user of the interfaces in
interface.go, with the notable exception of Append(). The reasoning
is that we treat corruption by removing the corrupted series, i.e. a
query for it will return no results on its next call anyway, so
return no results right now. In the case of Append(), we want to
tell the user that no data has been appended, though.
Minor side effects:
- Now consistently using filepath.* instead of path.*.
- Introduced structured logging where I touched it. This makes things
less consistent, but a complete change to structured logging would
be out of scope for this PR.
2016-02-25 03:23:42 -08:00
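// LastTime now reports data corruption through its error return rather than
// panicking, so the test checks the error explicitly (a real caller would
// quarantine the series instead, per the commit message above).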
if err != nil {
t . Fatal ( err )
}
2016-09-26 04:06:06 -07:00
s . persistence . archiveMetric ( fp , series . metric , series . firstTime ( ) , lastTime )
2016-03-09 09:56:30 -08:00
archived , _ , _ := s . persistence . hasArchivedMetric ( fp )
2014-10-28 11:01:41 -07:00
if ! archived {
t . Fatal ( "not archived" )
}
2015-02-26 06:19:44 -08:00
// Drop ~half of the chunks of an archived series.
2015-05-19 10:12:01 -07:00
s . maintainArchivedSeries ( fp , 10000 )
2016-03-09 09:56:30 -08:00
archived , _ , _ = s . persistence . hasArchivedMetric ( fp )
2014-10-28 11:01:41 -07:00
if ! archived {
2015-02-26 06:19:44 -08:00
t . Fatal ( "archived series purged although only half of the chunks dropped" )
2014-10-28 11:01:41 -07:00
}
2015-02-26 06:19:44 -08:00
// Drop everything.
2015-05-19 10:12:01 -07:00
s . maintainArchivedSeries ( fp , 100000 )
2016-03-09 09:56:30 -08:00
archived , _ , _ = s . persistence . hasArchivedMetric ( fp )
2014-10-28 11:01:41 -07:00
if archived {
t . Fatal ( "archived series not dropped" )
}
2015-04-09 06:57:11 -07:00
// Recreate series.
for _ , sample := range samples {
s . Append ( sample )
}
s . WaitForIndexing ( )
2015-05-06 07:53:12 -07:00
series , ok = s . fpToSeries . get ( fp )
2015-04-09 06:57:11 -07:00
if ! ok {
t . Fatal ( "could not find series" )
}
// Persist head chunk so we can safely archive.
series . headChunkClosed = true
2015-08-20 08:18:46 -07:00
s . maintainMemorySeries ( fp , model . Earliest )
2015-04-09 06:57:11 -07:00
// Archive metrics.
2015-05-06 07:53:12 -07:00
s . fpToSeries . del ( fp )
2016-09-21 14:44:27 -07:00
lastTime , err = series . head ( ) . LastTime ( )
2016-02-25 03:23:42 -08:00
if err != nil {
t . Fatal ( err )
}
2016-09-26 04:06:06 -07:00
s . persistence . archiveMetric ( fp , series . metric , series . firstTime ( ) , lastTime )
2016-03-09 09:56:30 -08:00
archived , _ , _ = s . persistence . hasArchivedMetric ( fp )
2015-04-09 06:57:11 -07:00
if ! archived {
t . Fatal ( "not archived" )
}
// Unarchive metrics.
2015-08-20 08:18:46 -07:00
s . getOrCreateSeries ( fp , model . Metric { } )
2015-04-09 06:57:11 -07:00
2015-05-06 07:53:12 -07:00
series , ok = s . fpToSeries . get ( fp )
2015-04-09 06:57:11 -07:00
if ! ok {
t . Fatal ( "could not find series" )
}
2016-03-09 09:56:30 -08:00
archived , _ , _ = s . persistence . hasArchivedMetric ( fp )
2015-04-09 06:57:11 -07:00
if archived {
t . Fatal ( "archived" )
}
2016-03-09 11:27:50 -08:00
// Set archiveHighWatermark to a low value so that we can see it increase.
s . archiveHighWatermark = 42
2015-04-09 06:57:11 -07:00
// This will archive again, but must not drop it completely, despite the
// memorySeries being empty.
2015-05-19 10:12:01 -07:00
s . maintainMemorySeries ( fp , 10000 )
2016-03-09 09:56:30 -08:00
archived , _ , _ = s . persistence . hasArchivedMetric ( fp )
2015-04-09 06:57:11 -07:00
if ! archived {
t . Fatal ( "series purged completely" )
}
2016-03-09 11:27:50 -08:00
// archiveHighWatermark must have been set by maintainMemorySeries.
if want , got := model . Time ( 19998 ) , s . archiveHighWatermark ; want != got {
t . Errorf ( "want archiveHighWatermark %v, got %v" , want , got )
}
2014-10-28 11:01:41 -07:00
}
2015-03-04 04:40:18 -08:00
func TestEvictAndPurgeSeriesChunkType0 ( t * testing . T ) {
testEvictAndPurgeSeries ( t , 0 )
}
func TestEvictAndPurgeSeriesChunkType1 ( t * testing . T ) {
testEvictAndPurgeSeries ( t , 1 )
}
2016-03-12 12:34:51 -08:00
func TestEvictAndPurgeSeriesChunkType2 ( t * testing . T ) {
testEvictAndPurgeSeries ( t , 2 )
}
2016-09-21 14:44:27 -07:00
func testEvictAndLoadChunkDescs ( t * testing . T , encoding chunk . Encoding ) {
2015-08-20 08:18:46 -07:00
samples := make ( model . Samples , 10000 )
2015-07-15 10:53:15 -07:00
for i := range samples {
2015-08-20 08:18:46 -07:00
samples [ i ] = & model . Sample {
Timestamp : model . Time ( 2 * i ) ,
Value : model . SampleValue ( float64 ( i * i ) ) ,
2015-07-15 10:53:15 -07:00
}
}
// Give last sample a timestamp of now so that the head chunk will not
// be closed (which would then archive the time series later as
// everything will get evicted).
2015-08-20 08:18:46 -07:00
samples [ len ( samples ) - 1 ] = & model . Sample {
Timestamp : model . Now ( ) ,
Value : model . SampleValue ( 3.14 ) ,
2015-07-15 10:53:15 -07:00
}
2016-10-10 07:30:10 -07:00
// Sadly, chunk.NumMemChunks is a global variable. We have to reset it
// explicitly here.
atomic . StoreInt64 ( & chunk . NumMemChunks , 0 )
2015-07-15 10:53:15 -07:00
s , closer := NewTestStorage ( t , encoding )
defer closer . Close ( )
2017-03-01 06:17:31 -08:00
// Adjust target heap size to lower value to see evictions.
s . targetHeapSize = 1000000
2015-07-15 10:53:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
s . WaitForIndexing ( )
2015-08-20 08:18:46 -07:00
fp := model . Metric { } . FastFingerprint ( )
2015-07-15 10:53:15 -07:00
series , ok := s . fpToSeries . get ( fp )
if ! ok {
t . Fatal ( "could not find series" )
}
oldLen := len ( series . chunkDescs )
// Maintain series without any dropped chunks.
s . maintainMemorySeries ( fp , 0 )
// Give the evict goroutine an opportunity to run.
2017-03-01 06:17:31 -08:00
time . Sleep ( 1250 * time . Millisecond )
2016-09-28 14:33:34 -07:00
// Maintain series again to trigger chunk.Desc eviction.
2015-07-15 10:53:15 -07:00
s . maintainMemorySeries ( fp , 0 )
if oldLen <= len ( series . chunkDescs ) {
t . Errorf ( "Expected number of chunkDescs to decrease, old number %d, current number %d." , oldLen , len ( series . chunkDescs ) )
}
2016-10-10 07:30:10 -07:00
if int64 ( len ( series . chunkDescs ) ) < atomic . LoadInt64 ( & chunk . NumMemChunks ) {
t . Errorf ( "NumMemChunks is larger than number of chunk descs, number of chunk descs: %d, NumMemChunks: %d." , len ( series . chunkDescs ) , atomic . LoadInt64 ( & chunk . NumMemChunks ) )
}
2015-07-15 10:53:15 -07:00
// Load everything back.
2016-09-18 04:20:46 -07:00
it := s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , 0 , 100000 )
2015-07-15 10:53:15 -07:00
if oldLen != len ( series . chunkDescs ) {
t . Errorf ( "Expected number of chunkDescs to have reached old value again, old number %d, current number %d." , oldLen , len ( series . chunkDescs ) )
}
2016-07-11 11:27:25 -07:00
it . Close ( )
2015-07-15 10:53:15 -07:00
// Now maintain series with drops to make sure nothing crazy happens.
s . maintainMemorySeries ( fp , 100000 )
if len ( series . chunkDescs ) != 1 {
2016-09-28 14:33:34 -07:00
t . Errorf ( "Expected exactly one chunk.Desc left, got %d." , len ( series . chunkDescs ) )
2015-07-15 10:53:15 -07:00
}
}
func TestEvictAndLoadChunkDescsType0 ( t * testing . T ) {
testEvictAndLoadChunkDescs ( t , 0 )
}
func TestEvictAndLoadChunkDescsType1 ( t * testing . T ) {
testEvictAndLoadChunkDescs ( t , 1 )
}
2016-09-21 14:44:27 -07:00
func benchmarkAppend ( b * testing . B , encoding chunk . Encoding ) {
2015-08-20 08:18:46 -07:00
samples := make ( model . Samples , b . N )
2014-06-06 02:55:53 -07:00
for i := range samples {
2015-08-20 08:18:46 -07:00
samples [ i ] = & model . Sample {
Metric : model . Metric {
model . MetricNameLabel : model . LabelValue ( fmt . Sprintf ( "test_metric_%d" , i % 10 ) ) ,
"label1" : model . LabelValue ( fmt . Sprintf ( "test_metric_%d" , i % 10 ) ) ,
"label2" : model . LabelValue ( fmt . Sprintf ( "test_metric_%d" , i % 10 ) ) ,
2014-06-06 02:55:53 -07:00
} ,
2015-08-20 08:18:46 -07:00
Timestamp : model . Time ( i ) ,
Value : model . SampleValue ( i ) ,
2014-06-06 02:55:53 -07:00
}
}
b . ResetTimer ( )
2015-03-13 07:49:07 -07:00
s , closer := NewTestStorage ( b , encoding )
2014-06-06 02:55:53 -07:00
defer closer . Close ( )
2015-03-14 19:36:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
2014-06-06 02:55:53 -07:00
}
2014-08-14 09:23:49 -07:00
2015-03-04 04:40:18 -08:00
func BenchmarkAppendType0 ( b * testing . B ) {
benchmarkAppend ( b , 0 )
}
func BenchmarkAppendType1 ( b * testing . B ) {
benchmarkAppend ( b , 1 )
}
2016-03-23 08:30:41 -07:00
func BenchmarkAppendType2 ( b * testing . B ) {
benchmarkAppend ( b , 2 )
}
2014-10-28 11:01:41 -07:00
// Append a large number of random samples and then check if we can get them out
// of the storage alright.
2016-09-21 14:44:27 -07:00
func testFuzz ( t * testing . T , encoding chunk . Encoding ) {
2014-10-28 11:01:41 -07:00
if testing . Short ( ) {
t . Skip ( "Skipping test in short mode." )
}
2014-08-14 09:23:49 -07:00
2014-10-28 11:01:41 -07:00
check := func ( seed int64 ) bool {
rand . Seed ( seed )
2015-03-13 07:49:07 -07:00
s , c := NewTestStorage ( t , encoding )
2014-08-14 09:23:49 -07:00
defer c . Close ( )
2015-05-19 10:12:01 -07:00
samples := createRandomSamples ( "test_fuzz" , 10000 )
2015-03-14 19:36:15 -07:00
for _ , sample := range samples {
s . Append ( sample )
}
2016-03-20 09:14:47 -07:00
if ! verifyStorageRandom ( t , s , samples ) {
return false
}
return verifyStorageSequential ( t , s , samples )
2014-08-14 09:23:49 -07:00
}
if err := quick . Check ( check , nil ) ; err != nil {
t . Fatal ( err )
}
}
2015-03-04 04:40:18 -08:00
func TestFuzzChunkType0 ( t * testing . T ) {
testFuzz ( t , 0 )
}
func TestFuzzChunkType1 ( t * testing . T ) {
testFuzz ( t , 1 )
}
2016-03-12 12:34:51 -08:00
func TestFuzzChunkType2 ( t * testing . T ) {
testFuzz ( t , 2 )
}
2015-03-04 04:40:18 -08:00
// benchmarkFuzz is the benchmark version of testFuzz. The storage options are
// set such that evictions, checkpoints, and purging will happen concurrently,
// too. This benchmark will have a very long runtime (up to minutes). You can
// use it as an actual benchmark. Run it like this:
2014-10-28 11:01:41 -07:00
//
2015-03-08 18:33:10 -07:00
// go test -cpu 1,2,4,8 -run=NONE -bench BenchmarkFuzzChunkType -benchmem
2014-10-28 11:01:41 -07:00
//
// You can also use it as a test for races. In that case, run it like this (will
// make things even slower):
//
2015-03-08 18:33:10 -07:00
// go test -race -cpu 8 -short -bench BenchmarkFuzzChunkType
2016-09-21 14:44:27 -07:00
func benchmarkFuzz ( b * testing . B , encoding chunk . Encoding ) {
chunk . DefaultEncoding = encoding
2015-03-06 07:03:03 -08:00
const samplesPerRun = 100000
2014-10-28 11:01:41 -07:00
rand . Seed ( 42 )
2015-05-28 11:58:38 -07:00
directory := testutil . NewTemporaryDirectory ( "test_storage" , b )
2014-10-28 11:01:41 -07:00
defer directory . Close ( )
o := & MemorySeriesStorageOptions {
2017-03-01 06:17:31 -08:00
TargetHeapSize : 200000 ,
2014-10-28 11:01:41 -07:00
PersistenceRetentionPeriod : time . Hour ,
PersistenceStoragePath : directory . Path ( ) ,
storage: Use staleness delta as head chunk timeout
Currently, if a series stops to exist, its head chunk will be kept
open for an hour. That prevents it from being persisted. Which
prevents it from being evicted. Which prevents the series from being
archived.
Most of the time, once no sample has been added to a series within the
staleness limit, we can be pretty confident that this series will not
receive samples anymore. The whole chain as described above can be
started after 5m instead of 1h. In the relaxed case, this doesn't
change a lot as the head chunk timeout is only checked during series
maintenance, and usually, a series is only maintained every six
hours. However, there is the typical scenario where a large service is
deployed, the deploy turns out to be bad, and then it is deployed
again within minutes, and quite quickly the number of time series has
tripled. That's the point where the Prometheus server is stressed and
switches (rightfully) into rushed mode. In that mode, time series are
processed as quickly as possible, but all of that is in vain if all of
those recently ended time series cannot be persisted yet for another
hour. In that scenario, this change will help most, and it's exactly
the scenario where help is most desperately needed.
2017-03-26 14:44:50 -07:00
HeadChunkTimeout : 5 * time . Minute ,
2015-03-04 04:40:18 -08:00
CheckpointInterval : time . Second ,
2015-03-19 07:41:50 -07:00
SyncStrategy : Adaptive ,
2016-01-11 07:42:10 -08:00
MinShrinkRatio : 0.1 ,
2014-10-28 11:01:41 -07:00
}
2015-05-18 10:26:28 -07:00
s := NewMemorySeriesStorage ( o )
if err := s . Start ( ) ; err != nil {
b . Fatalf ( "Error starting storage: %s" , err )
2014-10-28 11:01:41 -07:00
}
defer s . Stop ( )
2015-03-04 04:40:18 -08:00
samples := createRandomSamples ( "benchmark_fuzz" , samplesPerRun * b . N )
b . ResetTimer ( )
for i := 0 ; i < b . N ; i ++ {
start := samplesPerRun * i
end := samplesPerRun * ( i + 1 )
middle := ( start + end ) / 2
2015-03-14 19:36:15 -07:00
for _ , sample := range samples [ start : middle ] {
s . Append ( sample )
}
2016-06-23 04:03:41 -07:00
verifyStorageRandom ( b , s , samples [ : middle ] )
2015-03-14 19:36:15 -07:00
for _ , sample := range samples [ middle : end ] {
s . Append ( sample )
}
2016-06-23 04:03:41 -07:00
verifyStorageRandom ( b , s , samples [ : end ] )
verifyStorageSequential ( b , s , samples )
2015-03-04 04:40:18 -08:00
}
}
func BenchmarkFuzzChunkType0 ( b * testing . B ) {
benchmarkFuzz ( b , 0 )
}
func BenchmarkFuzzChunkType1 ( b * testing . B ) {
benchmarkFuzz ( b , 1 )
2014-10-28 11:01:41 -07:00
}
2016-03-12 12:34:51 -08:00
func BenchmarkFuzzChunkType2 ( b * testing . B ) {
benchmarkFuzz ( b , 2 )
}
2015-08-20 08:18:46 -07:00
func createRandomSamples ( metricName string , minLen int ) model . Samples {
type valueCreator func ( ) model . SampleValue
type deltaApplier func ( model . SampleValue ) model . SampleValue
2014-08-14 09:23:49 -07:00
var (
2016-03-20 09:14:47 -07:00
maxMetrics = 5
maxStreakLength = 2000
maxTimeDelta = 10000
timestamp = model . Now ( ) - model . Time ( maxTimeDelta * minLen ) // So that some timestamps are in the future.
generators = [ ] struct {
2014-08-14 09:23:49 -07:00
createValue valueCreator
applyDelta [ ] deltaApplier
} {
{ // "Boolean".
2015-08-20 08:18:46 -07:00
createValue : func ( ) model . SampleValue {
return model . SampleValue ( rand . Intn ( 2 ) )
2014-08-14 09:23:49 -07:00
} ,
applyDelta : [ ] deltaApplier {
2015-08-20 08:18:46 -07:00
func ( _ model . SampleValue ) model . SampleValue {
return model . SampleValue ( rand . Intn ( 2 ) )
2014-08-14 09:23:49 -07:00
} ,
} ,
} ,
{ // Integer with int deltas of various byte length.
2015-08-20 08:18:46 -07:00
createValue : func ( ) model . SampleValue {
return model . SampleValue ( rand . Int63 ( ) - 1 << 62 )
2014-08-14 09:23:49 -07:00
} ,
applyDelta : [ ] deltaApplier {
2015-08-20 08:18:46 -07:00
func ( v model . SampleValue ) model . SampleValue {
return model . SampleValue ( rand . Intn ( 1 << 8 ) - 1 << 7 + int ( v ) )
2014-08-14 09:23:49 -07:00
} ,
2015-08-20 08:18:46 -07:00
func ( v model . SampleValue ) model . SampleValue {
return model . SampleValue ( rand . Intn ( 1 << 16 ) - 1 << 15 + int ( v ) )
2014-08-14 09:23:49 -07:00
} ,
2015-08-20 08:18:46 -07:00
func ( v model . SampleValue ) model . SampleValue {
return model . SampleValue ( rand . Int63n ( 1 << 32 ) - 1 << 31 + int64 ( v ) )
2014-08-14 09:23:49 -07:00
} ,
} ,
} ,
{ // Float with float32 and float64 deltas.
2015-08-20 08:18:46 -07:00
createValue : func ( ) model . SampleValue {
return model . SampleValue ( rand . NormFloat64 ( ) )
2014-08-14 09:23:49 -07:00
} ,
applyDelta : [ ] deltaApplier {
2015-08-20 08:18:46 -07:00
func ( v model . SampleValue ) model . SampleValue {
return v + model . SampleValue ( float32 ( rand . NormFloat64 ( ) ) )
2014-08-14 09:23:49 -07:00
} ,
2015-08-20 08:18:46 -07:00
func ( v model . SampleValue ) model . SampleValue {
return v + model . SampleValue ( rand . NormFloat64 ( ) )
2014-08-14 09:23:49 -07:00
} ,
} ,
} ,
}
2016-03-20 09:14:47 -07:00
timestampIncrementers = [ ] func ( baseDelta model . Time ) model . Time {
// Regular increments.
func ( delta model . Time ) model . Time {
return delta
} ,
// Jittered increments. σ is 1/100 of delta, e.g. 10ms for 10s scrape interval.
func ( delta model . Time ) model . Time {
return delta + model . Time ( rand . NormFloat64 ( ) * float64 ( delta ) / 100 )
} ,
// Regular increments, but missing a scrape with 10% chance.
func ( delta model . Time ) model . Time {
i := rand . Intn ( 100 )
if i < 90 {
return delta
}
if i < 99 {
return 2 * delta
}
return 3 * delta
// Ignoring the case with more than two missed scrapes in a row.
} ,
}
2014-08-14 09:23:49 -07:00
)
2015-05-11 08:15:30 -07:00
// Prefill result with two samples with colliding metrics (to test fingerprint mapping).
2015-08-20 08:18:46 -07:00
result := model . Samples {
& model . Sample {
Metric : model . Metric {
2015-05-06 07:53:12 -07:00
"instance" : "ip-10-33-84-73.l05.ams5.s-cloud.net:24483" ,
"status" : "503" ,
} ,
Value : 42 ,
Timestamp : timestamp ,
} ,
2015-08-20 08:18:46 -07:00
& model . Sample {
Metric : model . Metric {
2015-05-06 07:53:12 -07:00
"instance" : "ip-10-33-84-73.l05.ams5.s-cloud.net:24480" ,
"status" : "500" ,
} ,
Value : 2010 ,
Timestamp : timestamp + 1 ,
} ,
}
2014-08-14 09:23:49 -07:00
2015-08-20 08:18:46 -07:00
metrics := [ ] model . Metric { }
2014-10-28 11:01:41 -07:00
for n := rand . Intn ( maxMetrics ) ; n >= 0 ; n -- {
2015-08-20 08:18:46 -07:00
metrics = append ( metrics , model . Metric {
model . MetricNameLabel : model . LabelValue ( metricName ) ,
model . LabelName ( fmt . Sprintf ( "labelname_%d" , n + 1 ) ) : model . LabelValue ( fmt . Sprintf ( "labelvalue_%d" , rand . Int ( ) ) ) ,
2014-08-14 09:23:49 -07:00
} )
}
2015-03-04 04:40:18 -08:00
for len ( result ) < minLen {
2016-03-20 09:14:47 -07:00
var (
// Pick a metric for this cycle.
metric = metrics [ rand . Intn ( len ( metrics ) ) ]
timeDelta = model . Time ( rand . Intn ( maxTimeDelta ) + 1 )
generator = generators [ rand . Intn ( len ( generators ) ) ]
createValue = generator . createValue
applyDelta = generator . applyDelta [ rand . Intn ( len ( generator . applyDelta ) ) ]
incTimestamp = timestampIncrementers [ rand . Intn ( len ( timestampIncrementers ) ) ]
)
2014-10-28 11:01:41 -07:00
switch rand . Intn ( 4 ) {
2014-08-14 09:23:49 -07:00
case 0 : // A single sample.
2015-08-20 08:18:46 -07:00
result = append ( result , & model . Sample {
2014-08-14 09:23:49 -07:00
Metric : metric ,
Value : createValue ( ) ,
2014-10-28 11:01:41 -07:00
Timestamp : timestamp ,
2014-08-14 09:23:49 -07:00
} )
2016-03-20 09:14:47 -07:00
timestamp += incTimestamp ( timeDelta )
2014-08-14 09:23:49 -07:00
case 1 : // A streak of random sample values.
2014-10-28 11:01:41 -07:00
for n := rand . Intn ( maxStreakLength ) ; n >= 0 ; n -- {
2015-08-20 08:18:46 -07:00
result = append ( result , & model . Sample {
2014-08-14 09:23:49 -07:00
Metric : metric ,
Value : createValue ( ) ,
2014-10-28 11:01:41 -07:00
Timestamp : timestamp ,
2014-08-14 09:23:49 -07:00
} )
2016-03-20 09:14:47 -07:00
timestamp += incTimestamp ( timeDelta )
2014-08-14 09:23:49 -07:00
}
case 2 : // A streak of sample values with incremental changes.
value := createValue ( )
2014-10-28 11:01:41 -07:00
for n := rand . Intn ( maxStreakLength ) ; n >= 0 ; n -- {
2015-08-20 08:18:46 -07:00
result = append ( result , & model . Sample {
2014-08-14 09:23:49 -07:00
Metric : metric ,
Value : value ,
2014-10-28 11:01:41 -07:00
Timestamp : timestamp ,
2014-08-14 09:23:49 -07:00
} )
2016-03-20 09:14:47 -07:00
timestamp += incTimestamp ( timeDelta )
2014-08-14 09:23:49 -07:00
value = applyDelta ( value )
}
case 3 : // A streak of constant sample values.
value := createValue ( )
2014-10-28 11:01:41 -07:00
for n := rand . Intn ( maxStreakLength ) ; n >= 0 ; n -- {
2015-08-20 08:18:46 -07:00
result = append ( result , & model . Sample {
2014-08-14 09:23:49 -07:00
Metric : metric ,
Value : value ,
2014-10-28 11:01:41 -07:00
Timestamp : timestamp ,
2014-08-14 09:23:49 -07:00
} )
2016-03-20 09:14:47 -07:00
timestamp += incTimestamp ( timeDelta )
2014-08-14 09:23:49 -07:00
}
}
}
return result
}
2016-06-28 23:14:23 -07:00
func verifyStorageRandom ( t testing . TB , s * MemorySeriesStorage , samples model . Samples ) bool {
2015-03-04 04:40:18 -08:00
s . WaitForIndexing ( )
2014-08-14 09:23:49 -07:00
result := true
2014-10-28 11:01:41 -07:00
for _ , i := range rand . Perm ( len ( samples ) ) {
2014-08-14 09:23:49 -07:00
sample := samples [ i ]
Checkpoint fingerprint mappings only upon shutdown
Before, we checkpointed after every newly detected fingerprint
collision, which is not a problem as long as collisions are
rare. However, with a sufficient number of metrics or particular
nature of the data set, there might be a lot of collisions, all to be
detected upon the first set of scrapes, and then the checkpointing
after each detection will take a quite long time (it's O(n²),
essentially).
Since we are rebuilding the fingerprint mapping during crash recovery,
the previous, very conservative approach didn't even buy us
anything. We only ever read from the checkpoint file after a clean
shutdown, so the only time we need to write the checkpoint file is
during a clean shutdown.
2016-04-14 07:02:37 -07:00
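// Resolve any fingerprint collision through the mapper (the same mapping whose
// checkpointing the commit message above describes); the mapped fingerprint is
// then used to look the series up below.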
fp := s . mapper . mapFP ( sample . Metric . FastFingerprint ( ) , sample . Metric )
2016-09-18 04:20:46 -07:00
it := s . preloadChunksForInstant ( makeFingerprintSeriesPair ( s , fp ) , sample . Timestamp , sample . Timestamp )
2016-02-16 09:47:50 -08:00
found := it . ValueAtOrBeforeTime ( sample . Timestamp )
2016-03-20 09:14:47 -07:00
startTime := it . ( * boundedIterator ) . start
switch {
case found . Timestamp != model . Earliest && sample . Timestamp . Before ( startTime ) :
t . Errorf ( "Sample #%d %#v: Expected outdated sample to be excluded." , i , sample )
result = false
case found . Timestamp == model . Earliest && ! sample . Timestamp . Before ( startTime ) :
2016-03-12 12:34:51 -08:00
t . Errorf ( "Sample #%d %#v: Expected sample not found." , i , sample )
2014-10-28 11:01:41 -07:00
result = false
2016-03-20 09:14:47 -07:00
case found . Timestamp == model . Earliest && sample . Timestamp . Before ( startTime ) :
// All good. Outdated sample dropped.
case sample . Value != found . Value || sample . Timestamp != found . Timestamp :
t . Errorf (
"Sample #%d %#v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v)." ,
i , sample , sample . Value , sample . Timestamp , found . Value , found . Timestamp ,
)
result = false
}
2016-07-11 11:27:25 -07:00
it . Close ( )
2016-03-20 09:14:47 -07:00
}
return result
}
2016-06-28 23:14:23 -07:00
func verifyStorageSequential ( t testing . TB , s * MemorySeriesStorage , samples model . Samples ) bool {
2016-03-20 09:14:47 -07:00
s . WaitForIndexing ( )
var (
result = true
fp model . Fingerprint
it SeriesIterator
r [ ] model . SamplePair
j int
)
defer func ( ) {
2016-07-11 11:27:25 -07:00
it . Close ( )
2016-03-20 09:14:47 -07:00
} ( )
for i , sample := range samples {
2016-04-14 07:02:37 -07:00
newFP := s . mapper . mapFP ( sample . Metric . FastFingerprint ( ) , sample . Metric )
2016-03-20 09:14:47 -07:00
if it == nil || newFP != fp {
fp = newFP
2016-07-11 11:27:25 -07:00
if it != nil {
it . Close ( )
}
2016-09-18 04:20:46 -07:00
it = s . preloadChunksForRange ( makeFingerprintSeriesPair ( s , fp ) , sample . Timestamp , model . Latest )
2016-03-20 09:14:47 -07:00
r = it . RangeValues ( metric . Interval {
OldestInclusive : sample . Timestamp ,
NewestInclusive : model . Latest ,
} )
j = - 1
}
startTime := it . ( * boundedIterator ) . start
if sample . Timestamp . Before ( startTime ) {
continue
}
j ++
if j >= len ( r ) {
t . Errorf (
"Sample #%d %v not found." ,
i , sample ,
)
result = false
2014-10-28 11:01:41 -07:00
continue
2014-08-14 09:23:49 -07:00
}
2016-03-20 09:14:47 -07:00
found := r [ j ]
2016-02-16 09:47:50 -08:00
		if sample.Value != found.Value || sample.Timestamp != found.Timestamp {
			t.Errorf(
				"Sample #%d %v: Value (or timestamp) mismatch, want %f (at time %v), got %f (at time %v).",
				i, sample, sample.Value, sample.Timestamp, found.Value, found.Timestamp,
			)
			result = false
		}
	}
	return result
}
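// The following is an illustrative sketch (not part of the original test
// suite) of how verifyStorageSequential could be driven directly: append a
// small, strictly increasing series, wait for indexing, and verify it sample
// by sample. The metric name, sample count, and timestamp spacing are
// arbitrary choices made for this example.
func testVerifySequentialSketch(t *testing.T) {
	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	samples := make(model.Samples, 0, 10)
	for i := 0; i < 10; i++ {
		samples = append(samples, &model.Sample{
			Metric:    model.Metric{model.MetricNameLabel: "verify_sequential_sketch"},
			Timestamp: model.Time(i * 1000),
			Value:     model.SampleValue(i),
		})
	}
	for _, sample := range samples {
		if err := s.Append(sample); err != nil {
			t.Fatal(err)
		}
	}
	s.WaitForIndexing()

	if !verifyStorageSequential(t, s, samples) {
		t.Error("sequential verification failed")
	}
}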
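// TestAppendOutOfOrder checks the error semantics of Append for samples that
// arrive out of order or duplicate an existing timestamp, including the NaN
// cases, and then verifies that only the accepted samples are retrievable.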
func TestAppendOutOfOrder(t *testing.T) {
	s, closer := NewTestStorage(t, 2)
	defer closer.Close()

	m := model.Metric{
		model.MetricNameLabel: "out_of_order",
	}
	tests := []struct {
		name      string
		timestamp model.Time
		value     model.SampleValue
		wantErr   error
	}{
		{
			name:      "1st sample",
			timestamp: 0,
			value:     0,
			wantErr:   nil,
		},
		{
			name:      "regular append",
			timestamp: 2,
			value:     1,
			wantErr:   nil,
		},
		{
			name:      "same timestamp, same value (no-op)",
			timestamp: 2,
			value:     1,
			wantErr:   nil,
		},
		{
			name:      "same timestamp, different value",
			timestamp: 2,
			value:     2,
			wantErr:   ErrDuplicateSampleForTimestamp,
		},
		{
			name:      "earlier timestamp, same value",
			timestamp: 1,
			value:     2,
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "earlier timestamp, different value",
			timestamp: 1,
			value:     3,
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "regular append of NaN",
			timestamp: 3,
			value:     model.SampleValue(math.NaN()),
			wantErr:   nil,
		},
		{
			name:      "no-op append of NaN",
			timestamp: 3,
			value:     model.SampleValue(math.NaN()),
			wantErr:   nil,
		},
		{
			name:      "append of NaN with earlier timestamp",
			timestamp: 2,
			value:     model.SampleValue(math.NaN()),
			wantErr:   ErrOutOfOrderSample,
		},
		{
			name:      "append of normal sample after NaN with same timestamp",
			timestamp: 3,
			value:     3.14,
			wantErr:   ErrDuplicateSampleForTimestamp,
		},
	}
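	// Append each sample in turn and compare the returned error against the
	// expectation for that step.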
	for _, test := range tests {
		gotErr := s.Append(&model.Sample{
			Metric:    m,
			Timestamp: test.timestamp,
			Value:     test.value,
		})
		if gotErr != test.wantErr {
			t.Errorf("%s: got %q, want %q", test.name, gotErr, test.wantErr)
		}
	}
	fp := s.mapper.mapFP(m.FastFingerprint(), m)

	it := s.preloadChunksForRange(makeFingerprintSeriesPair(s, fp), 0, 2)
	defer it.Close()

	want := []model.SamplePair{
		{
			Timestamp: 0,
			Value:     0,
		},
		{
			Timestamp: 2,
			Value:     1,
		},
		{
			Timestamp: 3,
			Value:     model.SampleValue(math.NaN()),
		},
	}
	got := it.RangeValues(metric.Interval{OldestInclusive: 0, NewestInclusive: 3})
	// Note that we cannot simply use reflect.DeepEqual(want, got) here
	// because NaN != NaN.
	for i, gotSamplePair := range got {
		wantSamplePair := want[i]
		if !wantSamplePair.Equal(&gotSamplePair) {
			t.Fatalf("want %v, got %v", wantSamplePair, gotSamplePair)
		}
	}
}
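// As an illustrative, hedged sketch (not taken from the original code base),
// a caller ingesting samples of unknown ordering might distinguish the two
// error values exercised above: out-of-order samples are dropped and counted,
// while duplicate-timestamp conflicts keep the value already stored. The
// function and its counters are hypothetical.
func appendLenientSketch(s *MemorySeriesStorage, samples model.Samples) (dropped, duplicates int) {
	for _, sample := range samples {
		switch err := s.Append(sample); err {
		case nil:
			// Accepted, or an exact duplicate that Append treats as a no-op.
		case ErrDuplicateSampleForTimestamp:
			// Same timestamp, different value: the stored value wins.
			duplicates++
		case ErrOutOfOrderSample:
			// Older than the newest sample already stored for this series.
			dropped++
		default:
			// Anything else is unexpected in this sketch; count it as dropped.
			dropped++
		}
	}
	return dropped, duplicates
}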
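// TestCalculatePersistUrgency feeds synthetic runtime.MemStats snapshots and
// storage state (evict-list length, number of chunks waiting for
// persistence, target heap size) into calculatePersistUrgency and checks the
// resulting persist urgency, the number of chunks to evict, and the recorded
// GC count.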
func TestCalculatePersistUrgency(t *testing.T) {
	tests := map[string]struct {
		persistUrgency                        int32
		lenEvictList                          int
		numChunksToPersist                    int64
		targetHeapSize, msNextGC, msHeapAlloc uint64
		msNumGC, lastNumGC                    uint32

		wantPersistUrgency int32
		wantChunksToEvict  int
		wantLastNumGC      uint32
	}{
"all zeros" : {
persistUrgency : 0 ,
lenEvictList : 0 ,
numChunksToPersist : 0 ,
targetHeapSize : 0 ,
msNextGC : 0 ,
msHeapAlloc : 0 ,
msNumGC : 0 ,
lastNumGC : 0 ,
wantPersistUrgency : 0 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 0 ,
} ,
"far from target heap size, plenty of chunks to persist, GC has happened" : {
persistUrgency : 500 ,
lenEvictList : 1000 ,
numChunksToPersist : 100 ,
targetHeapSize : 1000000 ,
msNextGC : 500000 ,
msHeapAlloc : 400000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 45 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"far from target heap size, plenty of chunks to persist, GC hasn't happened, urgency must not decrease" : {
persistUrgency : 500 ,
lenEvictList : 1000 ,
numChunksToPersist : 100 ,
targetHeapSize : 1000000 ,
msNextGC : 500000 ,
msHeapAlloc : 400000 ,
msNumGC : 42 ,
lastNumGC : 42 ,
wantPersistUrgency : 500 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"far from target heap size but no chunks to persist" : {
persistUrgency : 50 ,
lenEvictList : 0 ,
numChunksToPersist : 100 ,
targetHeapSize : 1000000 ,
msNextGC : 500000 ,
msHeapAlloc : 400000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 500 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"far from target heap size but no chunks to persist, HeapAlloc > NextGC" : {
persistUrgency : 50 ,
lenEvictList : 0 ,
numChunksToPersist : 100 ,
targetHeapSize : 1000000 ,
msNextGC : 500000 ,
msHeapAlloc : 600000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 600 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded but GC hasn't happened" : {
persistUrgency : 50 ,
lenEvictList : 3000 ,
numChunksToPersist : 1000 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 42 ,
wantPersistUrgency : 275 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded, GC has happened" : {
persistUrgency : 50 ,
lenEvictList : 3000 ,
numChunksToPersist : 1000 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 275 ,
wantChunksToEvict : 97 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded, GC has happened, urgency bumped due to low number of evictable chunks" : {
persistUrgency : 50 ,
lenEvictList : 300 ,
numChunksToPersist : 100 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 323 ,
wantChunksToEvict : 97 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded but no evictable chunks and GC hasn't happened" : {
persistUrgency : 50 ,
lenEvictList : 0 ,
numChunksToPersist : 1000 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 42 ,
wantPersistUrgency : 1000 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded but no evictable chunks and GC has happened" : {
persistUrgency : 50 ,
lenEvictList : 0 ,
numChunksToPersist : 1000 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 1000 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded, very few evictable chunks, GC hasn't happened" : {
persistUrgency : 50 ,
lenEvictList : 10 ,
numChunksToPersist : 1000 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 42 ,
wantPersistUrgency : 1000 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded, some evictable chunks (but not enough), GC hasn't happened" : {
persistUrgency : 50 ,
lenEvictList : 50 ,
numChunksToPersist : 250 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 42 ,
wantPersistUrgency : 916 ,
wantChunksToEvict : 0 ,
wantLastNumGC : 42 ,
} ,
"target heap size exceeded, some evictable chunks (but not enough), GC has happened" : {
persistUrgency : 50 ,
lenEvictList : 50 ,
numChunksToPersist : 250 ,
targetHeapSize : 1000000 ,
msNextGC : 1100000 ,
msHeapAlloc : 900000 ,
msNumGC : 42 ,
lastNumGC : 41 ,
wantPersistUrgency : 1000 ,
wantChunksToEvict : 50 ,
wantLastNumGC : 42 ,
} ,
}
	s, closer := NewTestStorage(t, 1)
	defer closer.Close()
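	// Every scenario runs against the same storage instance: the relevant
	// fields are overwritten, the evict list is re-populated, and a synthetic
	// MemStats snapshot is handed to calculatePersistUrgency.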
	for scenario, test := range tests {
		s.persistUrgency = test.persistUrgency
		s.numChunksToPersist = test.numChunksToPersist
		s.targetHeapSize = test.targetHeapSize
		s.lastNumGC = test.lastNumGC
		s.evictList.Init()
		for i := 0; i < test.lenEvictList; i++ {
			s.evictList.PushBack(&struct{}{})
		}
		ms := runtime.MemStats{
			NextGC:    test.msNextGC,
			HeapAlloc: test.msHeapAlloc,
			NumGC:     test.msNumGC,
		}
		chunksToEvict := s.calculatePersistUrgency(&ms)

		if chunksToEvict != test.wantChunksToEvict {
			t.Errorf(
				"scenario %q: got %d chunks to evict, want %d",
				scenario, chunksToEvict, test.wantChunksToEvict,
			)
		}
		if s.persistUrgency != test.wantPersistUrgency {
			t.Errorf(
				"scenario %q: got persist urgency %d, want %d",
				scenario, s.persistUrgency, test.wantPersistUrgency,
			)
		}
		if s.lastNumGC != test.wantLastNumGC {
			t.Errorf(
				"scenario %q: got lastNumGC %d, want %d",
				scenario, s.lastNumGC, test.wantLastNumGC,
			)
		}
	}
}