// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE ON FILENAME: Do not rename this file helpers_test.go (which might appear
// to be an obvious choice). We need NewTestStorage in tests outside of the
// local package, too. On the other hand, moving NewTestStorage into its own
// package would cause circular dependencies in the tests in package local.

package local
import (
	"time"

	"github.com/prometheus/common/model"

	"github.com/prometheus/prometheus/storage/local/chunk"
	"github.com/prometheus/prometheus/util/testutil"
)

type testStorageCloser struct {
	storage   Storage
	directory testutil.Closer
}

func (t *testStorageCloser) Close() {
	if err := t.storage.Stop(); err != nil {
		panic(err)
	}
	t.directory.Close()
}

// NewTestStorage creates a storage instance backed by files in a temporary
// directory. The returned storage is already in serving state. Upon closing the
// returned testutil.Closer, the temporary directory is cleaned up.
func NewTestStorage(t testutil.T, encoding chunk.Encoding) (*MemorySeriesStorage, testutil.Closer) {
	chunk.DefaultEncoding = encoding
	directory := testutil.NewTemporaryDirectory("test_storage", t)
	o := &MemorySeriesStorageOptions{
		TargetHeapSize:             1000000000,
		PersistenceRetentionPeriod: 24 * time.Hour * 365 * 100, // Enough to never trigger purging.
		PersistenceStoragePath:     directory.Path(),
		HeadChunkTimeout:           5 * time.Minute,
		CheckpointInterval:         time.Hour,
		SyncStrategy:               Adaptive,
	}
	storage := NewMemorySeriesStorage(o)
	storage.archiveHighWatermark = model.Latest
	if err := storage.Start(); err != nil {
		directory.Close()
		t.Fatalf("Error creating storage: %s", err)
	}

	closer := &testStorageCloser{
		storage:   storage,
		directory: directory,
	}

	return storage, closer
}
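
// The sketch below illustrates how NewTestStorage is typically consumed by a
// test: pick a chunk encoding, create the storage, append a sample, wait for
// indexing, and close the returned Closer when done. It is a hypothetical,
// unused helper added for illustration only; the encoding (chunk.Varbit), the
// metric name, and the sample value are arbitrary choices.
func newTestStorageWithSampleSketch(t testutil.T) (*MemorySeriesStorage, testutil.Closer) {
	s, closer := NewTestStorage(t, chunk.Varbit)
	sample := &model.Sample{
		Metric:    model.Metric{model.MetricNameLabel: "test_metric"},
		Timestamp: model.Now(),
		Value:     42,
	}
	if err := s.Append(sample); err != nil {
		closer.Close()
		t.Fatalf("Error appending sample: %s", err)
	}
	// WaitForIndexing blocks until the appended sample is queryable.
	s.WaitForIndexing()
	return s, closer
}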
func makeFingerprintSeriesPair(s *MemorySeriesStorage, fp model.Fingerprint) fingerprintSeriesPair {
	return fingerprintSeriesPair{fp, s.seriesForRange(fp, model.Earliest, model.Latest)}
}