Mirror of https://github.com/prometheus/prometheus.git (synced 2025-03-05 20:59:13 -08:00)

Commit cd569b51d9: Merge branch 'master' into bootstrap4
@@ -1,5 +1,3 @@
-sudo: false
-
 language: go
 
 # Whenever the Go version is updated here, .circleci/config.yml and .promu.yml
@@ -36,7 +36,8 @@ GO_VERSION ?= $(shell $(GO) version)
 GO_VERSION_NUMBER ?= $(word 3, $(GO_VERSION))
 PRE_GO_111 ?= $(shell echo $(GO_VERSION_NUMBER) | grep -E 'go1\.(10|[0-9])\.')
 
-unexport GOVENDOR
+GOVENDOR :=
+GO111MODULE :=
 ifeq (, $(PRE_GO_111))
 ifneq (,$(wildcard go.mod))
 # Enforce Go modules support just in case the directory is inside GOPATH (and for Travis CI).

@@ -57,8 +58,6 @@ $(warning Some recipes may not work as expected as the current Go runtime is '$(
 # This repository isn't using Go modules (yet).
 GOVENDOR := $(FIRST_GOPATH)/bin/govendor
 endif
-
-unexport GO111MODULE
 endif
 PROMU := $(FIRST_GOPATH)/bin/promu
 STATICCHECK := $(FIRST_GOPATH)/bin/staticcheck
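A quick worked example of the PRE_GO_111 check above (version strings hypothetical): the pattern 'go1\.(10|[0-9])\.' only matches Go 1.0 through 1.10, so a toolchain reporting go1.9.7 or go1.10.4 sets PRE_GO_111 and keeps the govendor-based recipes, while go1.11.5 leaves it empty and the module-aware branch (GO111MODULE) is used instead.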
@@ -12,8 +12,9 @@ Release cadence of first pre-releases being cut is 6 weeks.
 | v2.5 | 2018-10-24 | Frederic Branczyk (GitHub: @brancz) |
 | v2.6 | 2018-12-05 | Simon Pasquier (GitHub: @simonpasquier) |
 | v2.7 | 2019-01-16 | Goutham Veeramachaneni (GitHub: @gouthamve) |
-| v2.8 | 2019-02-27 | **searching for volunteer** |
+| v2.8 | 2019-02-27 | Ganesh Vernekar (GitHub: @codesome) |
 | v2.9 | 2019-04-10 | **searching for volunteer** |
+| v2.10 | 2019-05-22 | **searching for volunteer** |
 
 If you are interested in volunteering please create a pull request against the [prometheus/prometheus](https://github.com/prometheus/prometheus) repository and propose yourself for the release series of your choice.
 
@@ -307,7 +307,7 @@ func main() {
 
 	var (
 		localStorage  = &tsdb.ReadyStorage{}
-		remoteStorage = remote.NewStorage(log.With(logger, "component", "remote"), localStorage.StartTime, time.Duration(cfg.RemoteFlushDeadline))
+		remoteStorage = remote.NewStorage(log.With(logger, "component", "remote"), prometheus.DefaultRegisterer, localStorage.StartTime, cfg.localStoragePath, time.Duration(cfg.RemoteFlushDeadline))
 		fanoutStorage = storage.NewFanout(logger, localStorage, remoteStorage)
 	)
 
@@ -107,9 +107,10 @@ var (
 		MinShards:         1,
 		MaxSamplesPerSend: 100,
 
-		// By default, buffer 100 batches, which at 100ms per batch is 10s. At
-		// 1000 shards, this will buffer 10M samples total.
-		Capacity:          100 * 100,
+		// Each shard will have a max of 10 samples pending in it's channel, plus the pending
+		// samples that have been enqueued. Theoretically we should only ever have about 110 samples
+		// per shard pending. At 1000 shards that's 110k.
+		Capacity:          10,
 		BatchSendDeadline: model.Duration(5 * time.Second),
 
 		// Max number of times to retry a batch on recoverable errors.
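Worked out, the buffering math behind the new comment is: a shard channel holds at most Capacity = 10 samples, and a batch of up to MaxSamplesPerSend = 100 samples may already be dequeued and awaiting send, so roughly 10 + 100 = 110 samples are pending per shard; at 1000 shards that is about 110,000 samples, compared with 100 * 100 = 10,000 per shard (10 million total) under the old default.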
@@ -215,7 +215,7 @@ func (d *Discovery) refresh() (tg *targetgroup.Group, err error) {
 	dr := DiscoveryResponse{}
 	err = json.Unmarshal(data, &dr)
 	if err != nil {
-		return tg, fmt.Errorf("an error occurred unmarshaling the disovery response json. %s", err)
+		return tg, fmt.Errorf("an error occurred unmarshaling the discovery response json. %s", err)
 	}
 
 	for _, container := range dr.Containers {
@ -81,6 +81,25 @@ func (ls *Labels) UnmarshalJSON(b []byte) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MatchLabels returns a subset of Labels that matches/does not match with the provided label names based on the 'on' boolean.
|
||||||
|
// If on is set to true, it returns the subset of labels that match with the provided label names and its inverse when 'on' is set to false.
|
||||||
|
func (ls Labels) MatchLabels(on bool, names ...string) Labels {
|
||||||
|
matchedLabels := Labels{}
|
||||||
|
|
||||||
|
nameSet := map[string]struct{}{}
|
||||||
|
for _, n := range names {
|
||||||
|
nameSet[n] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range ls {
|
||||||
|
if _, ok := nameSet[v.Name]; on == ok {
|
||||||
|
matchedLabels = append(matchedLabels, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return matchedLabels
|
||||||
|
}
|
||||||
|
|
||||||
// Hash returns a hash value for the label set.
|
// Hash returns a hash value for the label set.
|
||||||
func (ls Labels) Hash() uint64 {
|
func (ls Labels) Hash() uint64 {
|
||||||
b := make([]byte, 0, 1024)
|
b := make([]byte, 0, 1024)
|
||||||
|
|
pkg/labels/labels_test.go (new file, 104 lines)
@@ -0,0 +1,104 @@
+// Copyright 2019 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package labels
+
+import (
+	"testing"
+)
+
+func TestLabels_MatchLabels(t *testing.T) {
+	labels := Labels{
+		{
+			Name:  "__name__",
+			Value: "ALERTS",
+		},
+		{
+			Name:  "alertname",
+			Value: "HTTPRequestRateLow",
+		},
+		{
+			Name:  "alertstate",
+			Value: "pending",
+		},
+		{
+			Name:  "instance",
+			Value: "0",
+		},
+		{
+			Name:  "job",
+			Value: "app-server",
+		},
+		{
+			Name:  "severity",
+			Value: "critical",
+		},
+	}
+
+	providedNames := []string{
+		"__name__",
+		"alertname",
+		"alertstate",
+		"instance",
+	}
+
+	got := labels.MatchLabels(true, providedNames...)
+	expected := Labels{
+		{
+			Name:  "__name__",
+			Value: "ALERTS",
+		},
+		{
+			Name:  "alertname",
+			Value: "HTTPRequestRateLow",
+		},
+		{
+			Name:  "alertstate",
+			Value: "pending",
+		},
+		{
+			Name:  "instance",
+			Value: "0",
+		},
+	}
+
+	assertSlice(t, got, expected)
+
+	// Now try with 'on' set to false.
+	got = labels.MatchLabels(false, providedNames...)
+
+	expected = Labels{
+		{
+			Name:  "job",
+			Value: "app-server",
+		},
+		{
+			Name:  "severity",
+			Value: "critical",
+		},
+	}
+
+	assertSlice(t, got, expected)
+}
+
+func assertSlice(t *testing.T, got, expected Labels) {
+	if len(expected) != len(got) {
+		t.Errorf("expected the length of matched label names to be %d, but got %d", len(expected), len(got))
+	}
+
+	for i, expectedLabel := range expected {
+		if expectedLabel.Name != got[i].Name {
+			t.Errorf("expected to get Label with name %s, but got %s instead", expectedLabel.Name, got[i].Name)
+		}
+	}
+}
@@ -1434,9 +1434,16 @@ func (ev *evaluator) VectorBinop(op ItemType, lhs, rhs Vector, matching *VectorM
 		sig := sigf(rs.Metric)
 		// The rhs is guaranteed to be the 'one' side. Having multiple samples
 		// with the same signature means that the matching is many-to-many.
-		if _, found := rightSigs[sig]; found {
+		if duplSample, found := rightSigs[sig]; found {
+			// oneSide represents which side of the vector represents the 'one' in the many-to-one relationship.
+			oneSide := "right"
+			if matching.Card == CardOneToMany {
+				oneSide = "left"
+			}
+			matchedLabels := rs.Metric.MatchLabels(matching.On, matching.MatchingLabels...)
 			// Many-to-many matching not allowed.
-			ev.errorf("many-to-many matching not allowed: matching labels must be unique on one side")
+			ev.errorf("found duplicate series for the match group %s on the %s hand-side of the operation: [%s, %s]"+
+				";many-to-many matching not allowed: matching labels must be unique on one side", matchedLabels.String(), oneSide, rs.Metric.String(), duplSample.Metric.String())
 		}
 		rightSigs[sig] = rs
 	}
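For illustration only (metric names and labels hypothetical), the reworked message renders roughly as: found duplicate series for the match group {instance="0"} on the right hand-side of the operation: [{__name__="http_requests_total", instance="0", job="a"}, {__name__="http_requests_total", instance="0", job="b"}];many-to-many matching not allowed: matching labels must be unique on one side. It points at the offending match group and side instead of the old generic text.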
@ -14,6 +14,7 @@
|
||||||
package scrape
|
package scrape
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"reflect"
|
"reflect"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
@ -104,18 +105,18 @@ func (m *Manager) reload() {
|
||||||
m.mtxScrape.Lock()
|
m.mtxScrape.Lock()
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
for setName, groups := range m.targetSets {
|
for setName, groups := range m.targetSets {
|
||||||
var sp *scrapePool
|
if _, ok := m.scrapePools[setName]; !ok {
|
||||||
existing, ok := m.scrapePools[setName]
|
|
||||||
if !ok {
|
|
||||||
scrapeConfig, ok := m.scrapeConfigs[setName]
|
scrapeConfig, ok := m.scrapeConfigs[setName]
|
||||||
if !ok {
|
if !ok {
|
||||||
level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+setName)
|
level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+setName)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sp = newScrapePool(scrapeConfig, m.append, log.With(m.logger, "scrape_pool", setName))
|
sp, err := newScrapePool(scrapeConfig, m.append, log.With(m.logger, "scrape_pool", setName))
|
||||||
|
if err != nil {
|
||||||
|
level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", setName)
|
||||||
|
continue
|
||||||
|
}
|
||||||
m.scrapePools[setName] = sp
|
m.scrapePools[setName] = sp
|
||||||
} else {
|
|
||||||
sp = existing
|
|
||||||
}
|
}
|
||||||
|
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
|
@ -123,7 +124,7 @@ func (m *Manager) reload() {
|
||||||
go func(sp *scrapePool, groups []*targetgroup.Group) {
|
go func(sp *scrapePool, groups []*targetgroup.Group) {
|
||||||
sp.Sync(groups)
|
sp.Sync(groups)
|
||||||
wg.Done()
|
wg.Done()
|
||||||
}(sp, groups)
|
}(m.scrapePools[setName], groups)
|
||||||
|
|
||||||
}
|
}
|
||||||
m.mtxScrape.Unlock()
|
m.mtxScrape.Unlock()
|
||||||
|
@ -158,16 +159,24 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
|
||||||
}
|
}
|
||||||
m.scrapeConfigs = c
|
m.scrapeConfigs = c
|
||||||
|
|
||||||
// Cleanup and reload pool if config has changed.
|
// Cleanup and reload pool if the configuration has changed.
|
||||||
|
var failed bool
|
||||||
for name, sp := range m.scrapePools {
|
for name, sp := range m.scrapePools {
|
||||||
if cfg, ok := m.scrapeConfigs[name]; !ok {
|
if cfg, ok := m.scrapeConfigs[name]; !ok {
|
||||||
sp.stop()
|
sp.stop()
|
||||||
delete(m.scrapePools, name)
|
delete(m.scrapePools, name)
|
||||||
} else if !reflect.DeepEqual(sp.config, cfg) {
|
} else if !reflect.DeepEqual(sp.config, cfg) {
|
||||||
sp.reload(cfg)
|
err := sp.reload(cfg)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name)
|
||||||
|
failed = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if failed {
|
||||||
|
return fmt.Errorf("failed to apply the new configuration")
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
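Because ApplyConfig now reports reload failures, callers can propagate them instead of continuing with a half-applied state. A minimal sketch under the assumption of a go-kit logger (the helper name applyScrapeConfig is hypothetical, not part of this change):

    package example

    import (
        "github.com/go-kit/kit/log"
        "github.com/go-kit/kit/log/level"

        "github.com/prometheus/prometheus/config"
        "github.com/prometheus/prometheus/scrape"
    )

    // applyScrapeConfig applies a new configuration to the scrape manager and
    // surfaces the error that ApplyConfig now returns when any pool fails to reload.
    func applyScrapeConfig(m *scrape.Manager, cfg *config.Config, logger log.Logger) error {
        if err := m.ApplyConfig(cfg); err != nil {
            level.Error(logger).Log("msg", "applying scrape configuration failed", "err", err)
            return err
        }
        return nil
    }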
@@ -222,47 +222,115 @@ func TestPopulateLabels(t *testing.T) {
 	}
 }
 
-// TestScrapeManagerReloadNoChange tests that no scrape reload happens when there is no config change.
-func TestManagerReloadNoChange(t *testing.T) {
-	tsetName := "test"
+func loadConfiguration(t *testing.T, c string) *config.Config {
+	t.Helper()
 
-	cfgText := `
+	cfg := &config.Config{}
+	if err := yaml.UnmarshalStrict([]byte(c), cfg); err != nil {
+		t.Fatalf("Unable to load YAML config: %s", err)
+	}
+	return cfg
+}
+
+func noopLoop() loop {
+	return &testLoop{
+		startFunc: func(interval, timeout time.Duration, errc chan<- error) {},
+		stopFunc:  func() {},
+	}
+}
+
+func TestManagerApplyConfig(t *testing.T) {
+	// Valid initial configuration.
+	cfgText1 := `
 scrape_configs:
-- job_name: '` + tsetName + `'
+ - job_name: job1
   static_configs:
   - targets: ["foo:9090"]
-  - targets: ["bar:9090"]
 `
-	cfg := &config.Config{}
-	if err := yaml.UnmarshalStrict([]byte(cfgText), cfg); err != nil {
-		t.Fatalf("Unable to load YAML config cfgYaml: %s", err)
-	}
+	// Invalid configuration.
+	cfgText2 := `
+scrape_configs:
+ - job_name: job1
+   scheme: https
+   static_configs:
+   - targets: ["foo:9090"]
+   tls_config:
+     ca_file: /not/existing/ca/file
+`
+	// Valid configuration.
+	cfgText3 := `
+scrape_configs:
+ - job_name: job1
+   scheme: https
+   static_configs:
+   - targets: ["foo:9090"]
+`
+	var (
+		cfg1 = loadConfiguration(t, cfgText1)
+		cfg2 = loadConfiguration(t, cfgText2)
+		cfg3 = loadConfiguration(t, cfgText3)
+
+		ch = make(chan struct{}, 1)
+	)
 
 	scrapeManager := NewManager(nil, nil)
-	// Load the current config.
-	scrapeManager.ApplyConfig(cfg)
-
-	// As reload never happens, new loop should never be called.
 	newLoop := func(_ *Target, s scraper, _ int, _ bool, _ []*relabel.Config) loop {
-		t.Fatal("reload happened")
-		return nil
+		ch <- struct{}{}
+		return noopLoop()
 	}
 	sp := &scrapePool{
 		appendable:    &nopAppendable{},
 		activeTargets: map[uint64]*Target{},
 		loops: map[uint64]loop{
-			1: &testLoop{},
+			1: noopLoop(),
 		},
 		newLoop: newLoop,
 		logger:  nil,
-		config:  cfg.ScrapeConfigs[0],
+		config:  cfg1.ScrapeConfigs[0],
 	}
 	scrapeManager.scrapePools = map[string]*scrapePool{
-		tsetName: sp,
+		"job1": sp,
 	}
 
-	scrapeManager.ApplyConfig(cfg)
+	// Apply the initial configuration.
+	if err := scrapeManager.ApplyConfig(cfg1); err != nil {
+		t.Fatalf("unable to apply configuration: %s", err)
+	}
+	select {
+	case <-ch:
+		t.Fatal("reload happened")
+	default:
+	}
+
+	// Apply a configuration for which the reload fails.
+	if err := scrapeManager.ApplyConfig(cfg2); err == nil {
+		t.Fatalf("expecting error but got none")
+	}
+	select {
+	case <-ch:
+		t.Fatal("reload happened")
+	default:
+	}
+
+	// Apply a configuration for which the reload succeeds.
+	if err := scrapeManager.ApplyConfig(cfg3); err != nil {
+		t.Fatalf("unable to apply configuration: %s", err)
+	}
+	select {
+	case <-ch:
+	default:
+		t.Fatal("reload didn't happen")
+	}
+
+	// Re-applying the same configuration shouldn't trigger a reload.
+	if err := scrapeManager.ApplyConfig(cfg3); err != nil {
+		t.Fatalf("unable to apply configuration: %s", err)
+	}
+	select {
+	case <-ch:
+		t.Fatal("reload happened")
+	default:
+	}
 }
 
 func TestManagerTargetsUpdates(t *testing.T) {
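As a usage note, the new behaviour can be exercised on its own with the standard Go tooling, for example: go test ./scrape -run TestManagerApplyConfig (run from the repository root; adjust the package path if the checkout layout differs).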
@@ -28,6 +28,7 @@ import (
 
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
+	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
 	config_util "github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"

@@ -61,6 +62,30 @@ var (
 		},
 		[]string{"interval"},
 	)
+	targetScrapePools = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "prometheus_target_scrape_pools_total",
+			Help: "Total number of scrape pool creation atttempts.",
+		},
+	)
+	targetScrapePoolsFailed = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "prometheus_target_scrape_pools_failed_total",
+			Help: "Total number of scrape pool creations that failed.",
+		},
+	)
+	targetScrapePoolReloads = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "prometheus_target_scrape_pool_reloads_total",
+			Help: "Total number of scrape loop reloads.",
+		},
+	)
+	targetScrapePoolReloadsFailed = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "prometheus_target_scrape_pool_reloads_failed_total",
+			Help: "Total number of failed scrape loop reloads.",
+		},
+	)
 	targetSyncIntervalLength = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
 			Name: "prometheus_target_sync_length_seconds",

@@ -105,6 +130,10 @@ var (
 func init() {
 	prometheus.MustRegister(targetIntervalLength)
 	prometheus.MustRegister(targetReloadIntervalLength)
+	prometheus.MustRegister(targetScrapePools)
+	prometheus.MustRegister(targetScrapePoolsFailed)
+	prometheus.MustRegister(targetScrapePoolReloads)
+	prometheus.MustRegister(targetScrapePoolReloadsFailed)
 	prometheus.MustRegister(targetSyncIntervalLength)
 	prometheus.MustRegister(targetScrapePoolSyncsCounter)
 	prometheus.MustRegister(targetScrapeSampleLimit)

@@ -136,15 +165,16 @@ const maxAheadTime = 10 * time.Minute
 
 type labelsMutator func(labels.Labels) labels.Labels
 
-func newScrapePool(cfg *config.ScrapeConfig, app Appendable, logger log.Logger) *scrapePool {
+func newScrapePool(cfg *config.ScrapeConfig, app Appendable, logger log.Logger) (*scrapePool, error) {
+	targetScrapePools.Inc()
 	if logger == nil {
 		logger = log.NewNopLogger()
 	}
 
 	client, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName)
 	if err != nil {
-		// Any errors that could occur here should be caught during config validation.
-		level.Error(logger).Log("msg", "Error creating HTTP client", "err", err)
+		targetScrapePoolsFailed.Inc()
+		return nil, errors.Wrap(err, "error creating HTTP client")
 	}
 
 	buffers := pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) })

@@ -182,7 +212,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app Appendable, logger log.Logger)
 		)
 	}
 
-	return sp
+	return sp, nil
 }
 
 func (sp *scrapePool) ActiveTargets() []*Target {

@@ -227,7 +257,8 @@ func (sp *scrapePool) stop() {
 // reload the scrape pool with the given scrape configuration. The target state is preserved
 // but all scrape loops are restarted with the new scrape configuration.
 // This method returns after all scrape loops that were stopped have stopped scraping.
-func (sp *scrapePool) reload(cfg *config.ScrapeConfig) {
+func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
+	targetScrapePoolReloads.Inc()
 	start := time.Now()
 
 	sp.mtx.Lock()

@@ -235,8 +266,8 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) {
 
 	client, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName)
 	if err != nil {
-		// Any errors that could occur here should be caught during config validation.
-		level.Error(sp.logger).Log("msg", "Error creating HTTP client", "err", err)
+		targetScrapePoolReloadsFailed.Inc()
+		return errors.Wrap(err, "error creating HTTP client")
 	}
 	sp.config = cfg
 	sp.client = client

@@ -272,6 +303,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) {
 	targetReloadIntervalLength.WithLabelValues(interval.String()).Observe(
 		time.Since(start).Seconds(),
 	)
+	return nil
 }
 
 // Sync converts target groups into actual scrape targets and synchronizes
@@ -29,16 +29,14 @@ import (
 	"testing"
 	"time"
 
-	"github.com/prometheus/prometheus/pkg/relabel"
-
+	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 
-	dto "github.com/prometheus/client_model/go"
-
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/discovery/targetgroup"
 	"github.com/prometheus/prometheus/pkg/labels"
+	"github.com/prometheus/prometheus/pkg/relabel"
 	"github.com/prometheus/prometheus/pkg/textparse"
 	"github.com/prometheus/prometheus/pkg/timestamp"
 	"github.com/prometheus/prometheus/pkg/value"

@@ -48,9 +46,9 @@ import (
 
 func TestNewScrapePool(t *testing.T) {
 	var (
 		app = &nopAppendable{}
 		cfg = &config.ScrapeConfig{}
-		sp  = newScrapePool(cfg, app, nil)
+		sp, _ = newScrapePool(cfg, app, nil)
 	)
 
 	if a, ok := sp.appendable.(*nopAppendable); !ok || a != app {

@@ -85,7 +83,7 @@ func TestDroppedTargetsList(t *testing.T) {
 			},
 		},
 		}
-		sp = newScrapePool(cfg, app, nil)
+		sp, _ = newScrapePool(cfg, app, nil)
 		expectedLabelSetString = "{__address__=\"127.0.0.1:9090\", __metrics_path__=\"\", __scheme__=\"\", job=\"dropMe\"}"
 		expectedLength = 1
 	)

@@ -307,7 +305,7 @@ func TestScrapePoolReload(t *testing.T) {
 func TestScrapePoolAppender(t *testing.T) {
 	cfg := &config.ScrapeConfig{}
 	app := &nopAppendable{}
-	sp := newScrapePool(cfg, app, nil)
+	sp, _ := newScrapePool(cfg, app, nil)
 
 	loop := sp.newLoop(&Target{}, nil, 0, false, nil)
 	appl, ok := loop.(*scrapeLoop)

@@ -350,7 +348,7 @@ func TestScrapePoolRaces(t *testing.T) {
 	newConfig := func() *config.ScrapeConfig {
 		return &config.ScrapeConfig{ScrapeInterval: interval, ScrapeTimeout: timeout}
 	}
-	sp := newScrapePool(newConfig(), &nopAppendable{}, nil)
+	sp, _ := newScrapePool(newConfig(), &nopAppendable{}, nil)
 	tgts := []*targetgroup.Group{
 		{
 			Targets: []model.LabelSet{

@@ -880,7 +878,7 @@ func TestScrapeLoopAppendSampleLimit(t *testing.T) {
 		t.Fatalf("Did not see expected sample limit error: %s", err)
 	}
 
-	// Check that the Counter has been incremented a simgle time for the scrape,
+	// Check that the Counter has been incremented a single time for the scrape,
 	// not multiple times for each sample.
 	metric := dto.Metric{}
 	err = targetScrapeSampleLimit.Write(&metric)
@@ -70,15 +70,10 @@ type recoverableError struct {
 	error
 }
 
-// Store sends a batch of samples to the HTTP endpoint.
-func (c *Client) Store(ctx context.Context, req *prompb.WriteRequest) error {
-	data, err := proto.Marshal(req)
-	if err != nil {
-		return err
-	}
-
-	compressed := snappy.Encode(nil, data)
-	httpReq, err := http.NewRequest("POST", c.url.String(), bytes.NewReader(compressed))
+// Store sends a batch of samples to the HTTP endpoint, the request is the proto marshalled
+// and encoded bytes from codec.go.
+func (c *Client) Store(ctx context.Context, req []byte) error {
+	httpReq, err := http.NewRequest("POST", c.url.String(), bytes.NewReader(req))
 	if err != nil {
 		// Errors from NewRequest are from unparseable URLs, so are not
 		// recoverable.
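With this change the caller, rather than Client.Store, is responsible for serializing and compressing the write request. A minimal sketch of building the payload (request contents hypothetical; the proto/snappy calls mirror the code removed above):

    package example

    import (
        "context"

        "github.com/gogo/protobuf/proto"
        "github.com/golang/snappy"

        "github.com/prometheus/prometheus/prompb"
    )

    // storer mirrors the relevant part of the updated StorageClient interface.
    type storer interface {
        Store(ctx context.Context, req []byte) error
    }

    // buildAndStore marshals a WriteRequest, snappy-compresses it and hands the raw
    // bytes to the client, which is what callers must now do before Store.
    func buildAndStore(ctx context.Context, c storer, ts []prompb.TimeSeries) error {
        data, err := proto.Marshal(&prompb.WriteRequest{Timeseries: ts})
        if err != nil {
            return err
        }
        return c.Store(ctx, snappy.Encode(nil, data))
    }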
@@ -26,7 +26,6 @@ import (
 
 	config_util "github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
-	"github.com/prometheus/prometheus/prompb"
 )
 
 var longErrMessage = strings.Repeat("error message", maxErrMsgLen)

@@ -74,7 +73,7 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		err = c.Store(context.Background(), &prompb.WriteRequest{})
+		err = c.Store(context.Background(), []byte{})
 		if !reflect.DeepEqual(err, test.err) {
 			t.Errorf("%d. Unexpected error; want %v, got %v", i, test.err, err)
 		}
@@ -80,28 +80,6 @@ func EncodeReadResponse(resp *prompb.ReadResponse, w http.ResponseWriter) error
 	return err
 }
 
-// ToWriteRequest converts an array of samples into a WriteRequest proto.
-func ToWriteRequest(samples []*model.Sample) *prompb.WriteRequest {
-	req := &prompb.WriteRequest{
-		Timeseries: make([]prompb.TimeSeries, 0, len(samples)),
-	}
-
-	for _, s := range samples {
-		ts := prompb.TimeSeries{
-			Labels: MetricToLabelProtos(s.Metric),
-			Samples: []prompb.Sample{
-				{
-					Value:     float64(s.Value),
-					Timestamp: int64(s.Timestamp),
-				},
-			},
-		}
-		req.Timeseries = append(req.Timeseries, ts)
-	}
-
-	return req
-}
-
 // ToQuery builds a Query proto.
 func ToQuery(from, to int64, matchers []*labels.Matcher, p *storage.SelectParams) (*prompb.Query, error) {
 	ms, err := toLabelMatchers(matchers)

@@ -364,21 +342,6 @@ func fromLabelMatchers(matchers []*prompb.LabelMatcher) ([]*labels.Matcher, erro
 	return result, nil
 }
 
-// MetricToLabelProtos builds a []*prompb.Label from a model.Metric
-func MetricToLabelProtos(metric model.Metric) []prompb.Label {
-	labels := make([]prompb.Label, 0, len(metric))
-	for k, v := range metric {
-		labels = append(labels, prompb.Label{
-			Name:  string(k),
-			Value: string(v),
-		})
-	}
-	sort.Slice(labels, func(i int, j int) bool {
-		return labels[i].Name < labels[j].Name
-	})
-	return labels
-}
-
 // LabelProtosToMetric unpack a []*prompb.Label to a model.Metric
 func LabelProtosToMetric(labelPairs []*prompb.Label) model.Metric {
 	metric := make(model.Metric, len(labelPairs))

@@ -400,6 +363,26 @@ func labelProtosToLabels(labelPairs []prompb.Label) labels.Labels {
 	return result
 }
 
+func labelsetToLabelsProto(ls model.LabelSet) []prompb.Label {
+	result := make([]prompb.Label, 0, len(ls))
+	keys := make([]string, 0, len(ls))
+
+	for k := range ls {
+		keys = append(keys, string(k))
+	}
+	sort.Strings(keys)
+
+	for _, k := range keys {
+		ln := model.LabelName(k)
+		result = append(result, prompb.Label{
+			Name:  k,
+			Value: string(ls[ln]),
+		})
+	}
+
+	return result
+}
+
 func labelsToLabelsProto(labels labels.Labels) []prompb.Label {
 	result := make([]prompb.Label, 0, len(labels))
 	for _, l := range labels {

@@ -410,11 +393,3 @@ func labelsToLabelsProto(labels labels.Labels) []prompb.Label {
 	}
 	return result
 }
-
-func labelsToMetric(ls labels.Labels) model.Metric {
-	metric := make(model.Metric, len(ls))
-	for _, l := range ls {
-		metric[model.LabelName(l.Name)] = model.LabelValue(l.Value)
-	}
-	return metric
-}
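As a small worked example (label values hypothetical): feeding labelsetToLabelsProto a model.LabelSet with keys job, __name__ and instance yields prompb labels ordered __name__, instance, job, because the helper sorts the key set before building the slice, so remote-write payloads keep a deterministic label order.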
@@ -16,6 +16,7 @@ package remote
 import (
 	"context"
 	"math"
+	"strconv"
 	"sync"
 	"sync/atomic"
 	"time"

@@ -24,12 +25,16 @@ import (
 
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
+	"github.com/gogo/protobuf/proto"
+	"github.com/golang/snappy"
+
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	pkgrelabel "github.com/prometheus/prometheus/pkg/relabel"
 	"github.com/prometheus/prometheus/prompb"
 	"github.com/prometheus/prometheus/relabel"
+	"github.com/prometheus/tsdb"
 )
 
 // String constants for instrumentation.

@@ -66,7 +71,16 @@ var (
 			Namespace: namespace,
 			Subsystem: subsystem,
 			Name:      "failed_samples_total",
-			Help:      "Total number of samples which failed on send to remote storage.",
+			Help:      "Total number of samples which failed on send to remote storage, non-recoverable errors.",
+		},
+		[]string{queue},
+	)
+	retriedSamplesTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "retried_samples_total",
+			Help:      "Total number of samples which failed on send to remote storage but were retried because the send error was recoverable.",
 		},
 		[]string{queue},
 	)

@@ -75,7 +89,16 @@ var (
 			Namespace: namespace,
 			Subsystem: subsystem,
 			Name:      "dropped_samples_total",
-			Help:      "Total number of samples which were dropped due to the queue being full.",
+			Help:      "Total number of samples which were dropped after being read from the WAL before being sent via remote write.",
+		},
+		[]string{queue},
+	)
+	enqueueRetriesTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "enqueue_retries_total",
+			Help:      "Total number of times enqueue has failed because a shards queue was full.",
 		},
 		[]string{queue},
 	)

@@ -89,12 +112,30 @@ var (
 		},
 		[]string{queue},
 	)
-	queueLength = prometheus.NewGaugeVec(
+	queueLastSendTimestamp = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
-			Name:      "queue_length",
-			Help:      "The number of processed samples queued to be sent to the remote storage.",
+			Name:      "queue_last_send_timestamp",
+			Help:      "Timestamp of the last successful send by this queue.",
+		},
+		[]string{queue},
+	)
+	queueHighestSentTimestamp = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_highest_sent_timestamp",
+			Help:      "Timestamp from a WAL sample, the highest timestamp successfully sent by this queue.",
+		},
+		[]string{queue},
+	)
+	queuePendingSamples = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "pending_samples",
+			Help:      "The number of samples pending in the queues shards to be sent to the remote storage.",
 		},
 		[]string{queue},
 	)

@@ -121,9 +162,13 @@ var (
 func init() {
 	prometheus.MustRegister(succeededSamplesTotal)
 	prometheus.MustRegister(failedSamplesTotal)
+	prometheus.MustRegister(retriedSamplesTotal)
 	prometheus.MustRegister(droppedSamplesTotal)
+	prometheus.MustRegister(enqueueRetriesTotal)
 	prometheus.MustRegister(sentBatchDuration)
-	prometheus.MustRegister(queueLength)
+	prometheus.MustRegister(queueLastSendTimestamp)
+	prometheus.MustRegister(queueHighestSentTimestamp)
+	prometheus.MustRegister(queuePendingSamples)
 	prometheus.MustRegister(shardCapacity)
 	prometheus.MustRegister(numShards)
 }

@@ -132,25 +177,41 @@ func init() {
 // external timeseries database.
 type StorageClient interface {
 	// Store stores the given samples in the remote storage.
-	Store(context.Context, *prompb.WriteRequest) error
+	Store(context.Context, []byte) error
 	// Name identifies the remote storage implementation.
 	Name() string
 }
 
 // QueueManager manages a queue of samples to be sent to the Storage
-// indicated by the provided StorageClient.
+// indicated by the provided StorageClient. Implements writeTo interface
+// used by WAL Watcher.
 type QueueManager struct {
 	logger log.Logger
 
 	flushDeadline  time.Duration
 	cfg            config.QueueConfig
 	externalLabels model.LabelSet
 	relabelConfigs []*pkgrelabel.Config
 	client         StorageClient
 	queueName      string
 	logLimiter     *rate.Limiter
+	watcher        *WALWatcher
+	lastSendTimestampMetric    prometheus.Gauge
+	highestSentTimestampMetric prometheus.Gauge
+	pendingSamplesMetric       prometheus.Gauge
+	enqueueRetriesMetric       prometheus.Counter
+
+	lastSendTimestamp    int64
+	highestSentTimestamp int64
+	timestampLock        sync.Mutex
+
+	highestTimestampIn *int64 // highest timestamp of any sample ingested by remote storage via scrape (Appender)
+
+	seriesMtx            sync.Mutex
+	seriesLabels         map[uint64][]prompb.Label
+	seriesSegmentIndexes map[uint64]int
+	droppedSeries        map[uint64]struct{}
 
-	shardsMtx   sync.RWMutex
 	shards      *shards
 	numShards   int
 	reshardChan chan int

@@ -162,7 +223,7 @@ type QueueManager struct {
 }
 
 // NewQueueManager builds a new QueueManager.
-func NewQueueManager(logger log.Logger, cfg config.QueueConfig, externalLabels model.LabelSet, relabelConfigs []*pkgrelabel.Config, client StorageClient, flushDeadline time.Duration) *QueueManager {
+func NewQueueManager(logger log.Logger, walDir string, samplesIn *ewmaRate, highestTimestampIn *int64, cfg config.QueueConfig, externalLabels model.LabelSet, relabelConfigs []*pkgrelabel.Config, client StorageClient, flushDeadline time.Duration, startTime int64) *QueueManager {
 	if logger == nil {
 		logger = log.NewNopLogger()
 	} else {

@@ -177,16 +238,29 @@ func NewQueueManager(logger log.Logger, cfg config.QueueConfig, externalLabels m
 		client:    client,
 		queueName: client.Name(),
 
+		highestTimestampIn: highestTimestampIn,
+
+		seriesLabels:         make(map[uint64][]prompb.Label),
+		seriesSegmentIndexes: make(map[uint64]int),
+		droppedSeries:        make(map[uint64]struct{}),
+
 		logLimiter:  rate.NewLimiter(logRateLimit, logBurst),
 		numShards:   cfg.MinShards,
 		reshardChan: make(chan int),
 		quit:        make(chan struct{}),
 
-		samplesIn:          newEWMARate(ewmaWeight, shardUpdateDuration),
+		samplesIn:          samplesIn,
 		samplesOut:         newEWMARate(ewmaWeight, shardUpdateDuration),
 		samplesOutDuration: newEWMARate(ewmaWeight, shardUpdateDuration),
 	}
-	t.shards = t.newShards(t.numShards)
+
+	t.lastSendTimestampMetric = queueLastSendTimestamp.WithLabelValues(t.queueName)
+	t.highestSentTimestampMetric = queueHighestSentTimestamp.WithLabelValues(t.queueName)
+	t.pendingSamplesMetric = queuePendingSamples.WithLabelValues(t.queueName)
+	t.enqueueRetriesMetric = enqueueRetriesTotal.WithLabelValues(t.queueName)
+	t.watcher = NewWALWatcher(logger, client.Name(), t, walDir, startTime)
+	t.shards = t.newShards()
 
 	numShards.WithLabelValues(t.queueName).Set(float64(t.numShards))
 	shardCapacity.WithLabelValues(t.queueName).Set(float64(t.cfg.Capacity))

@@ -195,76 +269,144 @@
 	succeededSamplesTotal.WithLabelValues(t.queueName)
 	failedSamplesTotal.WithLabelValues(t.queueName)
 	droppedSamplesTotal.WithLabelValues(t.queueName)
+	retriedSamplesTotal.WithLabelValues(t.queueName)
+	// Reset pending samples metric to 0.
+	t.pendingSamplesMetric.Set(0)
 
 	return t
 }
 
-// Append queues a sample to be sent to the remote storage. It drops the
-// sample on the floor if the queue is full.
-// Always returns nil.
-func (t *QueueManager) Append(s *model.Sample) error {
-	snew := *s
-	snew.Metric = s.Metric.Clone()
-
-	for ln, lv := range t.externalLabels {
-		if _, ok := s.Metric[ln]; !ok {
-			snew.Metric[ln] = lv
-		}
-	}
-
-	snew.Metric = model.Metric(
-		relabel.Process(model.LabelSet(snew.Metric), t.relabelConfigs...))
-
-	if snew.Metric == nil {
-		return nil
-	}
-
-	t.shardsMtx.RLock()
-	enqueued := t.shards.enqueue(&snew)
-	t.shardsMtx.RUnlock()
-
-	if enqueued {
-		queueLength.WithLabelValues(t.queueName).Inc()
-	} else {
-		droppedSamplesTotal.WithLabelValues(t.queueName).Inc()
-		if t.logLimiter.Allow() {
-			level.Warn(t.logger).Log("msg", "Remote storage queue full, discarding sample. Multiple subsequent messages of this kind may be suppressed.")
-		}
-	}
-	return nil
-}
-
-// NeedsThrottling implements storage.SampleAppender. It will always return
-// false as a remote storage drops samples on the floor if backlogging instead
-// of asking for throttling.
-func (*QueueManager) NeedsThrottling() bool {
-	return false
+// Append queues a sample to be sent to the remote storage. Blocks until all samples are
+// enqueued on their shards or a shutdown signal is received.
+func (t *QueueManager) Append(s []tsdb.RefSample) bool {
+	type enqueuable struct {
+		ts  prompb.TimeSeries
+		ref uint64
+	}
+
+	tempSamples := make([]enqueuable, 0, len(s))
+	t.seriesMtx.Lock()
+	for _, sample := range s {
+		// If we have no labels for the series, due to relabelling or otherwise, don't send the sample.
+		if _, ok := t.seriesLabels[sample.Ref]; !ok {
+			droppedSamplesTotal.WithLabelValues(t.queueName).Inc()
+			if _, ok := t.droppedSeries[sample.Ref]; !ok && t.logLimiter.Allow() {
+				level.Info(t.logger).Log("msg", "dropped sample for series that was not explicitly dropped via relabelling", "ref", sample.Ref)
+			}
+			continue
+		}
+		tempSamples = append(tempSamples, enqueuable{
+			ts: prompb.TimeSeries{
+				Labels: t.seriesLabels[sample.Ref],
+				Samples: []prompb.Sample{
+					prompb.Sample{
+						Value:     float64(sample.V),
+						Timestamp: sample.T,
+					},
+				},
+			},
+			ref: sample.Ref,
+		})
+	}
+	t.seriesMtx.Unlock()
+
+outer:
+	for _, sample := range tempSamples {
+		// This will only loop if the queues are being resharded.
+		backoff := t.cfg.MinBackoff
+		for {
+			select {
+			case <-t.quit:
+				return false
+			default:
+			}
+
+			if t.shards.enqueue(sample.ref, sample.ts) {
+				continue outer
+			}
+
+			t.enqueueRetriesMetric.Inc()
+			time.Sleep(time.Duration(backoff))
+			backoff = backoff * 2
+			if backoff > t.cfg.MaxBackoff {
+				backoff = t.cfg.MaxBackoff
+			}
+		}
+	}
+	return true
 }
 
 // Start the queue manager sending samples to the remote storage.
 // Does not block.
 func (t *QueueManager) Start() {
+	t.shards.start(t.numShards)
+	t.watcher.Start()
+
 	t.wg.Add(2)
 	go t.updateShardsLoop()
 	go t.reshardLoop()
-
-	t.shardsMtx.Lock()
-	defer t.shardsMtx.Unlock()
-	t.shards.start()
 }
 
 // Stop stops sending samples to the remote storage and waits for pending
 // sends to complete.
 func (t *QueueManager) Stop() {
 	level.Info(t.logger).Log("msg", "Stopping remote storage...")
+	defer level.Info(t.logger).Log("msg", "Remote storage stopped.")
+
 	close(t.quit)
+	t.shards.stop()
+	t.watcher.Stop()
 	t.wg.Wait()
+}
 
-	t.shardsMtx.Lock()
-	defer t.shardsMtx.Unlock()
-	t.shards.stop(t.flushDeadline)
-
-	level.Info(t.logger).Log("msg", "Remote storage stopped.")
+// StoreSeries keeps track of which series we know about for lookups when sending samples to remote.
+func (t *QueueManager) StoreSeries(series []tsdb.RefSeries, index int) {
+	temp := make(map[uint64][]prompb.Label, len(series))
+	for _, s := range series {
+		ls := make(model.LabelSet, len(s.Labels))
+		for _, label := range s.Labels {
+			ls[model.LabelName(label.Name)] = model.LabelValue(label.Value)
+		}
+		t.processExternalLabels(ls)
+		rl := relabel.Process(ls, t.relabelConfigs...)
+		if len(rl) == 0 {
+			t.droppedSeries[s.Ref] = struct{}{}
+			continue
+		}
+		temp[s.Ref] = labelsetToLabelsProto(rl)
+	}
+
+	t.seriesMtx.Lock()
+	defer t.seriesMtx.Unlock()
+	for ref, labels := range temp {
+		t.seriesLabels[ref] = labels
+		t.seriesSegmentIndexes[ref] = index
+	}
+}
+
+// SeriesReset is used when reading a checkpoint. WAL Watcher should have
+// stored series records with the checkpoints index number, so we can now
+// delete any ref ID's lower than that # from the two maps.
+func (t *QueueManager) SeriesReset(index int) {
+	t.seriesMtx.Lock()
+	defer t.seriesMtx.Unlock()
+
+	// Check for series that are in segments older than the checkpoint
+	// that were not also present in the checkpoint.
+	for k, v := range t.seriesSegmentIndexes {
+		if v < index {
+			delete(t.seriesLabels, k)
+			delete(t.seriesSegmentIndexes, k)
+		}
+	}
+}
+
+func (t *QueueManager) processExternalLabels(ls model.LabelSet) {
+	for ln, lv := range t.externalLabels {
+		if _, ok := ls[ln]; !ok {
+			ls[ln] = lv
+		}
+	}
 }
 
 func (t *QueueManager) updateShardsLoop() {

@@ -275,6 +417,12 @@ func (t *QueueManager) updateShardsLoop() {
 	for {
 		select {
 		case <-ticker.C:
+			now := time.Now().Unix()
+			threshold := int64(time.Duration(2 * t.cfg.BatchSendDeadline).Seconds())
+			if now-t.lastSendTimestamp > threshold {
+				level.Debug(t.logger).Log("msg", "Skipping resharding, last successful send was beyond threshold")
+				continue
+			}
 			t.calculateDesiredShards()
 		case <-t.quit:
 			return

@@ -351,107 +499,150 @@ func (t *QueueManager) reshardLoop() {
 	for {
 		select {
 		case numShards := <-t.reshardChan:
-			t.reshard(numShards)
+			// We start the newShards after we have stopped (the therefore completely
+			// flushed) the oldShards, to guarantee we only every deliver samples in
+			// order.
+			t.shards.stop()
+			t.shards.start(numShards)
 		case <-t.quit:
 			return
 		}
 	}
 }
 
-func (t *QueueManager) reshard(n int) {
-	numShards.WithLabelValues(t.queueName).Set(float64(n))
-
-	t.shardsMtx.Lock()
-	newShards := t.newShards(n)
-	oldShards := t.shards
-	t.shards = newShards
-	t.shardsMtx.Unlock()
-
-	oldShards.stop(t.flushDeadline)
-
-	// We start the newShards after we have stopped (the therefore completely
-	// flushed) the oldShards, to guarantee we only every deliver samples in
-	// order.
-	newShards.start()
-}
-
-type shards struct {
-	qm      *QueueManager
-	queues  []chan *model.Sample
-	done    chan struct{}
-	running int32
-	ctx     context.Context
-	cancel  context.CancelFunc
-}
-
-func (t *QueueManager) newShards(numShards int) *shards {
-	queues := make([]chan *model.Sample, numShards)
-	for i := 0; i < numShards; i++ {
-		queues[i] = make(chan *model.Sample, t.cfg.Capacity)
-	}
-	ctx, cancel := context.WithCancel(context.Background())
+func (t *QueueManager) newShards() *shards {
 	s := &shards{
-		qm:      t,
-		queues:  queues,
-		done:    make(chan struct{}),
-		running: int32(numShards),
-		ctx:     ctx,
-		cancel:  cancel,
+		qm:   t,
+		done: make(chan struct{}),
 	}
 	return s
 }
 
-func (s *shards) start() {
-	for i := 0; i < len(s.queues); i++ {
-		go s.runShard(i)
+// Check and set highestSentTimestamp
+func (t *QueueManager) setHighestSentTimestamp(highest int64) {
+	t.timestampLock.Lock()
+	defer t.timestampLock.Unlock()
+	if highest > t.highestSentTimestamp {
+		t.highestSentTimestamp = highest
+		t.highestSentTimestampMetric.Set(float64(t.highestSentTimestamp))
 	}
 }
 
-func (s *shards) stop(deadline time.Duration) {
-	// Attempt a clean shutdown.
-	for _, shard := range s.queues {
-		close(shard)
+func (t *QueueManager) setLastSendTimestamp(now time.Time) {
+	t.timestampLock.Lock()
+	defer t.timestampLock.Unlock()
+	t.lastSendTimestampMetric.Set(float64(now.UnixNano()) / 1e9)
+	t.lastSendTimestamp = now.Unix()
+}
+
+type shards struct {
+	mtx sync.RWMutex // With the WAL, this is never actually contended.
+
+	qm     *QueueManager
+	queues []chan prompb.TimeSeries
+
+	// Emulate a wait group with a channel and an atomic int, as you
+	// cannot select on a wait group.
+	done    chan struct{}
+	running int32
+
+	// Soft shutdown context will prevent new enqueues and deadlocks.
+	softShutdown chan struct{}
+
+	// Hard shutdown context is used to terminate outgoing HTTP connections
+	// after giving them a chance to terminate.
+	hardShutdown context.CancelFunc
+}
+
+// start the shards; must be called before any call to enqueue.
+func (s *shards) start(n int) {
+	s.mtx.Lock()
+	defer s.mtx.Unlock()
+
+	newQueues := make([]chan prompb.TimeSeries, n)
+	for i := 0; i < n; i++ {
+		newQueues[i] = make(chan prompb.TimeSeries, s.qm.cfg.Capacity)
+	}
+
+	s.queues = newQueues
+
+	var hardShutdownCtx context.Context
+	hardShutdownCtx, s.hardShutdown = context.WithCancel(context.Background())
+	s.softShutdown = make(chan struct{})
+	s.running = int32(n)
+	s.done = make(chan struct{})
+	for i := 0; i < n; i++ {
+		go s.runShard(hardShutdownCtx, i, newQueues[i])
+	}
+	numShards.WithLabelValues(s.qm.queueName).Set(float64(n))
+}
+
+// stop the shards; subsequent call to enqueue will return false.
+func (s *shards) stop() {
+	// Attempt a clean shutdown, but only wait flushDeadline for all the shards
+	// to cleanly exit. As we're doing RPCs, enqueue can block indefinately.
+	// We must be able so call stop concurrently, hence we can only take the
+	// RLock here.
+	s.mtx.RLock()
+	close(s.softShutdown)
+	s.mtx.RUnlock()
+
+	// Enqueue should now be unblocked, so we can take the write lock. This
+	// also ensures we don't race with writes to the queues, and get a panic:
+	// send on closed channel.
+	s.mtx.Lock()
+	defer s.mtx.Unlock()
+	for _, queue := range s.queues {
+		close(queue)
 	}
 	select {
 	case <-s.done:
 		return
-	case <-time.After(deadline):
+	case <-time.After(s.qm.flushDeadline):
 		level.Error(s.qm.logger).Log("msg", "Failed to flush all samples on shutdown")
 	}
 
 	// Force an unclean shutdown.
-	s.cancel()
+	s.hardShutdown()
 	<-s.done
 }
 
-func (s *shards) enqueue(sample *model.Sample) bool {
-	s.qm.samplesIn.incr(1)
+// enqueue a sample. If we are currently in the process of shutting down or resharding,
+// will return false; in this case, you should back off and retry.
+func (s *shards) enqueue(ref uint64, sample prompb.TimeSeries) bool {
|
||||||
fp := sample.Metric.FastFingerprint()
|
s.mtx.RLock()
|
||||||
shard := uint64(fp) % uint64(len(s.queues))
|
defer s.mtx.RUnlock()
|
||||||
|
|
||||||
select {
|
select {
|
||||||
|
case <-s.softShutdown:
|
||||||
|
return false
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
shard := uint64(ref) % uint64(len(s.queues))
|
||||||
|
select {
|
||||||
|
case <-s.softShutdown:
|
||||||
|
return false
|
||||||
case s.queues[shard] <- sample:
|
case s.queues[shard] <- sample:
|
||||||
return true
|
return true
|
||||||
default:
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *shards) runShard(i int) {
|
func (s *shards) runShard(ctx context.Context, i int, queue chan prompb.TimeSeries) {
|
||||||
defer func() {
|
defer func() {
|
||||||
if atomic.AddInt32(&s.running, -1) == 0 {
|
if atomic.AddInt32(&s.running, -1) == 0 {
|
||||||
close(s.done)
|
close(s.done)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
queue := s.queues[i]
|
shardNum := strconv.Itoa(i)
|
||||||
|
|
||||||
// Send batches of at most MaxSamplesPerSend samples to the remote storage.
|
// Send batches of at most MaxSamplesPerSend samples to the remote storage.
|
||||||
// If we have fewer samples than that, flush them out after a deadline
|
// If we have fewer samples than that, flush them out after a deadline
|
||||||
// anyways.
|
// anyways.
|
||||||
pendingSamples := model.Samples{}
|
pendingSamples := []prompb.TimeSeries{}
|
||||||
|
|
||||||
|
max := s.qm.cfg.MaxSamplesPerSend
|
||||||
timer := time.NewTimer(time.Duration(s.qm.cfg.BatchSendDeadline))
|
timer := time.NewTimer(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||||
stop := func() {
|
stop := func() {
|
||||||
if !timer.Stop() {
|
if !timer.Stop() {
|
||||||
|
@ -465,25 +656,29 @@ func (s *shards) runShard(i int) {
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-s.ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
|
|
||||||
case sample, ok := <-queue:
|
case sample, ok := <-queue:
|
||||||
if !ok {
|
if !ok {
|
||||||
if len(pendingSamples) > 0 {
|
if len(pendingSamples) > 0 {
|
||||||
level.Debug(s.qm.logger).Log("msg", "Flushing samples to remote storage...", "count", len(pendingSamples))
|
level.Debug(s.qm.logger).Log("msg", "Flushing samples to remote storage...", "count", len(pendingSamples))
|
||||||
s.sendSamples(pendingSamples)
|
s.sendSamples(ctx, pendingSamples)
|
||||||
level.Debug(s.qm.logger).Log("msg", "Done flushing.")
|
level.Debug(s.qm.logger).Log("msg", "Done flushing.")
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
queueLength.WithLabelValues(s.qm.queueName).Dec()
|
// Number of pending samples is limited by the fact that sendSamples (via sendSamplesWithBackoff)
|
||||||
|
// retries endlessly, so once we reach > 100 samples, if we can never send to the endpoint we'll
|
||||||
|
// stop reading from the queue (which has a size of 10).
|
||||||
pendingSamples = append(pendingSamples, sample)
|
pendingSamples = append(pendingSamples, sample)
|
||||||
|
s.qm.pendingSamplesMetric.Inc()
|
||||||
|
|
||||||
if len(pendingSamples) >= s.qm.cfg.MaxSamplesPerSend {
|
if len(pendingSamples) >= max {
|
||||||
s.sendSamples(pendingSamples[:s.qm.cfg.MaxSamplesPerSend])
|
s.sendSamples(ctx, pendingSamples[:max])
|
||||||
pendingSamples = pendingSamples[s.qm.cfg.MaxSamplesPerSend:]
|
pendingSamples = pendingSamples[max:]
|
||||||
|
s.qm.pendingSamplesMetric.Sub(float64(max))
|
||||||
|
|
||||||
stop()
|
stop()
|
||||||
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||||
|
@ -491,17 +686,24 @@ func (s *shards) runShard(i int) {
|
||||||
|
|
||||||
case <-timer.C:
|
case <-timer.C:
|
||||||
if len(pendingSamples) > 0 {
|
if len(pendingSamples) > 0 {
|
||||||
s.sendSamples(pendingSamples)
|
level.Debug(s.qm.logger).Log("msg", "runShard timer ticked, sending samples", "samples", len(pendingSamples), "shard", shardNum)
|
||||||
|
n := len(pendingSamples)
|
||||||
|
s.sendSamples(ctx, pendingSamples)
|
||||||
pendingSamples = pendingSamples[:0]
|
pendingSamples = pendingSamples[:0]
|
||||||
|
s.qm.pendingSamplesMetric.Sub(float64(n))
|
||||||
}
|
}
|
||||||
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *shards) sendSamples(samples model.Samples) {
|
func (s *shards) sendSamples(ctx context.Context, samples []prompb.TimeSeries) {
|
||||||
begin := time.Now()
|
begin := time.Now()
|
||||||
s.sendSamplesWithBackoff(samples)
|
err := s.sendSamplesWithBackoff(ctx, samples)
|
||||||
|
if err != nil && s.qm.logLimiter.Allow() {
|
||||||
|
level.Error(s.qm.logger).Log("msg", "non-recoverable error", "count", len(samples), "err", err)
|
||||||
|
failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
|
||||||
|
}
|
||||||
|
|
||||||
// These counters are used to calculate the dynamic sharding, and as such
|
// These counters are used to calculate the dynamic sharding, and as such
|
||||||
// should be maintained irrespective of success or failure.
|
// should be maintained irrespective of success or failure.
|
||||||
|
@ -510,30 +712,67 @@ func (s *shards) sendSamples(samples model.Samples) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// sendSamples to the remote storage with backoff for recoverable errors.
|
// sendSamples to the remote storage with backoff for recoverable errors.
|
||||||
func (s *shards) sendSamplesWithBackoff(samples model.Samples) {
|
func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.TimeSeries) error {
|
||||||
backoff := s.qm.cfg.MinBackoff
|
backoff := s.qm.cfg.MinBackoff
|
||||||
req := ToWriteRequest(samples)
|
req, highest, err := buildWriteRequest(samples)
|
||||||
|
// Failing to build the write request is non-recoverable, since it will
|
||||||
for retries := s.qm.cfg.MaxRetries; retries > 0; retries-- {
|
// only error if marshaling the proto to bytes fails.
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
default:
|
||||||
|
}
|
||||||
begin := time.Now()
|
begin := time.Now()
|
||||||
err := s.qm.client.Store(s.ctx, req)
|
err := s.qm.client.Store(ctx, req)
|
||||||
|
|
||||||
sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(time.Since(begin).Seconds())
|
sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(time.Since(begin).Seconds())
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
succeededSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
|
succeededSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
|
||||||
return
|
now := time.Now()
|
||||||
|
s.qm.setLastSendTimestamp(now)
|
||||||
|
s.qm.setHighestSentTimestamp(highest)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
level.Warn(s.qm.logger).Log("msg", "Error sending samples to remote storage", "count", len(samples), "err", err)
|
|
||||||
if _, ok := err.(recoverableError); !ok {
|
if _, ok := err.(recoverableError); !ok {
|
||||||
break
|
return err
|
||||||
}
|
}
|
||||||
|
retriedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
|
||||||
|
|
||||||
|
if s.qm.logLimiter.Allow() {
|
||||||
|
level.Error(s.qm.logger).Log("err", err)
|
||||||
|
}
|
||||||
|
|
||||||
time.Sleep(time.Duration(backoff))
|
time.Sleep(time.Duration(backoff))
|
||||||
backoff = backoff * 2
|
backoff = backoff * 2
|
||||||
if backoff > s.qm.cfg.MaxBackoff {
|
if backoff > s.qm.cfg.MaxBackoff {
|
||||||
backoff = s.qm.cfg.MaxBackoff
|
backoff = s.qm.cfg.MaxBackoff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
|
|
||||||
|
func buildWriteRequest(samples []prompb.TimeSeries) ([]byte, int64, error) {
|
||||||
|
var highest int64
|
||||||
|
for _, ts := range samples {
|
||||||
|
// At the moment we only ever append a TimeSeries with a single sample in it.
|
||||||
|
if ts.Samples[0].Timestamp > highest {
|
||||||
|
highest = ts.Samples[0].Timestamp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
req := &prompb.WriteRequest{
|
||||||
|
Timeseries: samples,
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := proto.Marshal(req)
|
||||||
|
if err != nil {
|
||||||
|
return nil, highest, err
|
||||||
|
}
|
||||||
|
|
||||||
|
compressed := snappy.Encode(nil, data)
|
||||||
|
return compressed, highest, nil
|
||||||
}
|
}
|
||||||
|
|
|
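For orientation: the payload buildWriteRequest produces above is simply a prompb.WriteRequest, proto-marshalled and snappy-compressed, which the client's Store method then receives as raw bytes. The following is a minimal, self-contained sketch of producing and POSTing such a payload; the receiver URL (localhost:9201/write) and the metric name are hypothetical and not part of this change.

package main

import (
	"bytes"
	"log"
	"net/http"

	"github.com/gogo/protobuf/proto"
	"github.com/golang/snappy"

	"github.com/prometheus/prometheus/prompb"
)

func main() {
	// One series holding a single sample, mirroring what each shard sends.
	req := &prompb.WriteRequest{
		Timeseries: []prompb.TimeSeries{{
			Labels:  []prompb.Label{{Name: "__name__", Value: "example_metric"}},
			Samples: []prompb.Sample{{Value: 1, Timestamp: 1546300800000}},
		}},
	}

	data, err := proto.Marshal(req)
	if err != nil {
		log.Fatal(err)
	}
	compressed := snappy.Encode(nil, data) // the same encoding buildWriteRequest applies

	httpReq, err := http.NewRequest("POST", "http://localhost:9201/write", bytes.NewReader(compressed))
	if err != nil {
		log.Fatal(err)
	}
	httpReq.Header.Set("Content-Encoding", "snappy")
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	resp, err := http.DefaultClient.Do(httpReq)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	log.Println("remote write status:", resp.Status)
}

The headers set here are the ones Prometheus itself sends for remote write in this version of the protocol; a real receiver may of course enforce more.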
@@ -22,13 +22,229 @@ import (
 	"testing"
 	"time"
 
+	"github.com/gogo/protobuf/proto"
+	"github.com/golang/snappy"
+	"github.com/stretchr/testify/require"
+
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/prompb"
+	"github.com/prometheus/prometheus/util/testutil"
+	"github.com/prometheus/tsdb"
+	"github.com/prometheus/tsdb/labels"
 )
 
 const defaultFlushDeadline = 1 * time.Minute
 
+func TestSampleDelivery(t *testing.T) {
+	// Let's create an even number of send batches so we don't run into the
+	// batch timeout case.
+	n := config.DefaultQueueConfig.Capacity * 2
+	samples, series := createTimeseries(n)
+
+	c := NewTestStorageClient()
+	c.expectSamples(samples[:len(samples)/2], series)
+
+	cfg := config.DefaultQueueConfig
+	cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
+	cfg.MaxShards = 1
+	var temp int64
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, cfg, nil, nil, c, defaultFlushDeadline, 0)
+	m.seriesLabels = refSeriesToLabelsProto(series)
+
+	// These should be received by the client.
+	m.Start()
+	m.Append(samples[:len(samples)/2])
+	defer m.Stop()
+
+	c.waitForExpectedSamples(t)
+	m.Append(samples[len(samples)/2:])
+	c.expectSamples(samples[len(samples)/2:], series)
+	c.waitForExpectedSamples(t)
+}
+
+func TestSampleDeliveryTimeout(t *testing.T) {
+	// Let's send one less sample than batch size, and wait the timeout duration
+	n := 9
+	samples, series := createTimeseries(n)
+	c := NewTestStorageClient()
+
+	cfg := config.DefaultQueueConfig
+	cfg.MaxShards = 1
+	cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
+	var temp int64
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, cfg, nil, nil, c, defaultFlushDeadline, 0)
+	m.seriesLabels = refSeriesToLabelsProto(series)
+	m.Start()
+	defer m.Stop()
+
+	// Send the samples twice, waiting for the samples in the meantime.
+	c.expectSamples(samples, series)
+	m.Append(samples)
+	c.waitForExpectedSamples(t)
+
+	c.expectSamples(samples, series)
+	m.Append(samples)
+	c.waitForExpectedSamples(t)
+}
+
+func TestSampleDeliveryOrder(t *testing.T) {
+	ts := 10
+	n := config.DefaultQueueConfig.MaxSamplesPerSend * ts
+	samples := make([]tsdb.RefSample, 0, n)
+	series := make([]tsdb.RefSeries, 0, n)
+	for i := 0; i < n; i++ {
+		name := fmt.Sprintf("test_metric_%d", i%ts)
+		samples = append(samples, tsdb.RefSample{
+			Ref: uint64(i),
+			T:   int64(i),
+			V:   float64(i),
+		})
+		series = append(series, tsdb.RefSeries{
+			Ref:    uint64(i),
+			Labels: labels.Labels{labels.Label{Name: "__name__", Value: name}},
+		})
+	}
+
+	c := NewTestStorageClient()
+	c.expectSamples(samples, series)
+	var temp int64
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, config.DefaultQueueConfig, nil, nil, c, defaultFlushDeadline, 0)
+	m.seriesLabels = refSeriesToLabelsProto(series)
+
+	m.Start()
+	defer m.Stop()
+	// These should be received by the client.
+	m.Append(samples)
+	c.waitForExpectedSamples(t)
+}
+
+func TestShutdown(t *testing.T) {
+	deadline := 5 * time.Second
+	c := NewTestBlockedStorageClient()
+
+	var temp int64
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, config.DefaultQueueConfig, nil, nil, c, deadline, 0)
+	samples, series := createTimeseries(2 * config.DefaultQueueConfig.MaxSamplesPerSend)
+	m.seriesLabels = refSeriesToLabelsProto(series)
+	m.Start()
+
+	// Append blocks to guarantee delivery, so we do it in the background.
+	go func() {
+		m.Append(samples)
+	}()
+	time.Sleep(1 * time.Second)
+
+	// Test to ensure that Stop doesn't block.
+	start := time.Now()
+	m.Stop()
+	// The samples will never be delivered, so duration should
+	// be at least equal to deadline, otherwise the flush deadline
+	// was not respected.
+	duration := time.Since(start)
+	if duration > time.Duration(deadline+(deadline/10)) {
+		t.Errorf("Took too long to shutdown: %s > %s", duration, deadline)
+	}
+	if duration < time.Duration(deadline) {
+		t.Errorf("Shutdown occurred before flush deadline: %s < %s", duration, deadline)
+	}
+}
+
+func TestSeriesReset(t *testing.T) {
+	c := NewTestBlockedStorageClient()
+	deadline := 5 * time.Second
+	var temp int64
+	numSegments := 4
+	numSeries := 25
+
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, config.DefaultQueueConfig, nil, nil, c, deadline, 0)
+	for i := 0; i < numSegments; i++ {
+		series := []tsdb.RefSeries{}
+		for j := 0; j < numSeries; j++ {
+			series = append(series, tsdb.RefSeries{Ref: uint64((i * 100) + j), Labels: labels.Labels{labels.Label{Name: "a", Value: "a"}}})
+		}
+		m.StoreSeries(series, i)
+	}
+	testutil.Equals(t, numSegments*numSeries, len(m.seriesLabels))
+	m.SeriesReset(2)
+	testutil.Equals(t, numSegments*numSeries/2, len(m.seriesLabels))
+}
+
+func TestReshard(t *testing.T) {
+	size := 10 // Make bigger to find more races.
+	n := config.DefaultQueueConfig.Capacity * size
+	samples, series := createTimeseries(n)
+
+	c := NewTestStorageClient()
+	c.expectSamples(samples, series)
+
+	cfg := config.DefaultQueueConfig
+	cfg.MaxShards = 1
+
+	var temp int64
+	m := NewQueueManager(nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), &temp, cfg, nil, nil, c, defaultFlushDeadline, 0)
+	m.seriesLabels = refSeriesToLabelsProto(series)
+
+	m.Start()
+	defer m.Stop()
+
+	go func() {
+		for i := 0; i < len(samples); i += config.DefaultQueueConfig.Capacity {
+			sent := m.Append(samples[i : i+config.DefaultQueueConfig.Capacity])
+			require.True(t, sent)
+			time.Sleep(100 * time.Millisecond)
+		}
+	}()
+
+	for i := 1; i < len(samples)/config.DefaultQueueConfig.Capacity; i++ {
+		m.shards.stop()
+		m.shards.start(i)
+		time.Sleep(100 * time.Millisecond)
+	}
+
+	c.waitForExpectedSamples(t)
+}
+
+func createTimeseries(n int) ([]tsdb.RefSample, []tsdb.RefSeries) {
+	samples := make([]tsdb.RefSample, 0, n)
+	series := make([]tsdb.RefSeries, 0, n)
+	for i := 0; i < n; i++ {
+		name := fmt.Sprintf("test_metric_%d", i)
+		samples = append(samples, tsdb.RefSample{
+			Ref: uint64(i),
+			T:   int64(i),
+			V:   float64(i),
+		})
+		series = append(series, tsdb.RefSeries{
+			Ref:    uint64(i),
+			Labels: labels.Labels{labels.Label{Name: "__name__", Value: name}},
+		})
+	}
+	return samples, series
+}
+
+func getSeriesNameFromRef(r tsdb.RefSeries) string {
+	for _, l := range r.Labels {
+		if l.Name == "__name__" {
+			return l.Value
+		}
+	}
+	return ""
+}
+
+func refSeriesToLabelsProto(series []tsdb.RefSeries) map[uint64][]prompb.Label {
+	result := make(map[uint64][]prompb.Label)
+	for _, s := range series {
+		for _, l := range s.Labels {
+			result[s.Ref] = append(result[s.Ref], prompb.Label{
+				Name:  l.Name,
+				Value: l.Value,
+			})
+		}
+	}
+	return result
+}
+
 type TestStorageClient struct {
 	receivedSamples map[string][]prompb.Sample
 	expectedSamples map[string][]prompb.Sample
@@ -43,7 +259,7 @@ func NewTestStorageClient() *TestStorageClient {
 	}
 }
 
-func (c *TestStorageClient) expectSamples(ss model.Samples) {
+func (c *TestStorageClient) expectSamples(ss []tsdb.RefSample, series []tsdb.RefSeries) {
 	c.mtx.Lock()
 	defer c.mtx.Unlock()
 
@@ -51,10 +267,10 @@ func (c *TestStorageClient) expectSamples(ss model.Samples) {
 	c.receivedSamples = map[string][]prompb.Sample{}
 
 	for _, s := range ss {
-		ts := labelProtosToLabels(MetricToLabelProtos(s.Metric)).String()
-		c.expectedSamples[ts] = append(c.expectedSamples[ts], prompb.Sample{
-			Timestamp: int64(s.Timestamp),
-			Value:     float64(s.Value),
+		seriesName := getSeriesNameFromRef(series[s.Ref])
+		c.expectedSamples[seriesName] = append(c.expectedSamples[seriesName], prompb.Sample{
+			Timestamp: s.T,
+			Value:     s.V,
 		})
 	}
 	c.wg.Add(len(ss))
@@ -62,7 +278,6 @@ func (c *TestStorageClient) expectSamples(ss model.Samples) {
 
 func (c *TestStorageClient) waitForExpectedSamples(t *testing.T) {
 	c.wg.Wait()
-
 	c.mtx.Lock()
 	defer c.mtx.Unlock()
 	for ts, expectedSamples := range c.expectedSamples {
@@ -72,15 +287,31 @@ func (c *TestStorageClient) waitForExpectedSamples(t *testing.T) {
 	}
 }
 
-func (c *TestStorageClient) Store(_ context.Context, req *prompb.WriteRequest) error {
+func (c *TestStorageClient) Store(_ context.Context, req []byte) error {
 	c.mtx.Lock()
 	defer c.mtx.Unlock()
+	reqBuf, err := snappy.Decode(nil, req)
+	if err != nil {
+		return err
+	}
+
+	var reqProto prompb.WriteRequest
+	if err := proto.Unmarshal(reqBuf, &reqProto); err != nil {
+		return err
+	}
+
 	count := 0
-	for _, ts := range req.Timeseries {
-		labels := labelProtosToLabels(ts.Labels).String()
+	for _, ts := range reqProto.Timeseries {
+		var seriesName string
+		labels := labelProtosToLabels(ts.Labels)
+		for _, label := range labels {
+			if label.Name == "__name__" {
+				seriesName = label.Value
+			}
+		}
 		for _, sample := range ts.Samples {
 			count++
-			c.receivedSamples[labels] = append(c.receivedSamples[labels], sample)
+			c.receivedSamples[seriesName] = append(c.receivedSamples[seriesName], sample)
 		}
 	}
 	c.wg.Add(-count)
@@ -91,133 +322,21 @@ func (c *TestStorageClient) Name() string {
 	return "teststorageclient"
 }
 
-func TestSampleDelivery(t *testing.T) {
-	// Let's create an even number of send batches so we don't run into the
-	// batch timeout case.
-	n := config.DefaultQueueConfig.Capacity * 2
-
-	samples := make(model.Samples, 0, n)
-	for i := 0; i < n; i++ {
-		name := model.LabelValue(fmt.Sprintf("test_metric_%d", i))
-		samples = append(samples, &model.Sample{
-			Metric: model.Metric{
-				model.MetricNameLabel: name,
-			},
-			Value: model.SampleValue(i),
-		})
-	}
-
-	c := NewTestStorageClient()
-	c.expectSamples(samples[:len(samples)/2])
-
-	cfg := config.DefaultQueueConfig
-	cfg.MaxShards = 1
-	m := NewQueueManager(nil, cfg, nil, nil, c, defaultFlushDeadline)
-
-	// These should be received by the client.
-	for _, s := range samples[:len(samples)/2] {
-		m.Append(s)
-	}
-	// These will be dropped because the queue is full.
-	for _, s := range samples[len(samples)/2:] {
-		m.Append(s)
-	}
-	m.Start()
-	defer m.Stop()
-
-	c.waitForExpectedSamples(t)
-}
-
-func TestSampleDeliveryTimeout(t *testing.T) {
-	// Let's send one less sample than batch size, and wait the timeout duration
-	n := config.DefaultQueueConfig.Capacity - 1
-
-	samples := make(model.Samples, 0, n)
-	for i := 0; i < n; i++ {
-		name := model.LabelValue(fmt.Sprintf("test_metric_%d", i))
-		samples = append(samples, &model.Sample{
-			Metric: model.Metric{
-				model.MetricNameLabel: name,
-			},
-			Value: model.SampleValue(i),
-		})
-	}
-
-	c := NewTestStorageClient()
-
-	cfg := config.DefaultQueueConfig
-	cfg.MaxShards = 1
-	cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
-	m := NewQueueManager(nil, cfg, nil, nil, c, defaultFlushDeadline)
-	m.Start()
-	defer m.Stop()
-
-	// Send the samples twice, waiting for the samples in the meantime.
-	c.expectSamples(samples)
-	for _, s := range samples {
-		m.Append(s)
-	}
-	c.waitForExpectedSamples(t)
-
-	c.expectSamples(samples)
-	for _, s := range samples {
-		m.Append(s)
-	}
-	c.waitForExpectedSamples(t)
-}
-
-func TestSampleDeliveryOrder(t *testing.T) {
-	ts := 10
-	n := config.DefaultQueueConfig.MaxSamplesPerSend * ts
-
-	samples := make(model.Samples, 0, n)
-	for i := 0; i < n; i++ {
-		name := model.LabelValue(fmt.Sprintf("test_metric_%d", i%ts))
-		samples = append(samples, &model.Sample{
-			Metric: model.Metric{
-				model.MetricNameLabel: name,
-			},
-			Value:     model.SampleValue(i),
-			Timestamp: model.Time(i),
-		})
-	}
-
-	c := NewTestStorageClient()
-	c.expectSamples(samples)
-	m := NewQueueManager(nil, config.DefaultQueueConfig, nil, nil, c, defaultFlushDeadline)
-
-	// These should be received by the client.
-	for _, s := range samples {
-		m.Append(s)
-	}
-	m.Start()
-	defer m.Stop()
-
-	c.waitForExpectedSamples(t)
-}
-
 // TestBlockingStorageClient is a queue_manager StorageClient which will block
-// on any calls to Store(), until the `block` channel is closed, at which point
-// the `numCalls` property will contain a count of how many times Store() was
-// called.
+// on any calls to Store(), until the request's Context is cancelled, at which
+// point the `numCalls` property will contain a count of how many times Store()
+// was called.
 type TestBlockingStorageClient struct {
 	numCalls uint64
-	block    chan bool
 }
 
 func NewTestBlockedStorageClient() *TestBlockingStorageClient {
-	return &TestBlockingStorageClient{
-		block:    make(chan bool),
-		numCalls: 0,
-	}
+	return &TestBlockingStorageClient{}
 }
 
-func (c *TestBlockingStorageClient) Store(ctx context.Context, _ *prompb.WriteRequest) error {
+func (c *TestBlockingStorageClient) Store(ctx context.Context, _ []byte) error {
 	atomic.AddUint64(&c.numCalls, 1)
-	select {
-	case <-c.block:
-	case <-ctx.Done():
-	}
+	<-ctx.Done()
 	return nil
 }
 
@@ -225,106 +344,6 @@ func (c *TestBlockingStorageClient) NumCalls() uint64 {
 	return atomic.LoadUint64(&c.numCalls)
 }
 
-func (c *TestBlockingStorageClient) unlock() {
-	close(c.block)
-}
-
 func (c *TestBlockingStorageClient) Name() string {
 	return "testblockingstorageclient"
 }
 
-func (t *QueueManager) queueLen() int {
-	t.shardsMtx.Lock()
-	defer t.shardsMtx.Unlock()
-	queueLength := 0
-	for _, shard := range t.shards.queues {
-		queueLength += len(shard)
-	}
-	return queueLength
-}
-
-func TestSpawnNotMoreThanMaxConcurrentSendsGoroutines(t *testing.T) {
-	// Our goal is to fully empty the queue:
-	// `MaxSamplesPerSend*Shards` samples should be consumed by the
-	// per-shard goroutines, and then another `MaxSamplesPerSend`
-	// should be left on the queue.
-	n := config.DefaultQueueConfig.MaxSamplesPerSend * 2
-
-	samples := make(model.Samples, 0, n)
-	for i := 0; i < n; i++ {
-		name := model.LabelValue(fmt.Sprintf("test_metric_%d", i))
-		samples = append(samples, &model.Sample{
-			Metric: model.Metric{
-				model.MetricNameLabel: name,
-			},
-			Value: model.SampleValue(i),
-		})
-	}
-
-	c := NewTestBlockedStorageClient()
-	cfg := config.DefaultQueueConfig
-	cfg.MaxShards = 1
-	cfg.Capacity = n
-	m := NewQueueManager(nil, cfg, nil, nil, c, defaultFlushDeadline)
-
-	m.Start()
-
-	defer func() {
-		c.unlock()
-		m.Stop()
-	}()
-
-	for _, s := range samples {
-		m.Append(s)
-	}
-
-	// Wait until the runShard() loops drain the queue. If things went right, it
-	// should then immediately block in sendSamples(), but, in case of error,
-	// it would spawn too many goroutines, and thus we'd see more calls to
-	// client.Store()
-	//
-	// The timed wait is maybe non-ideal, but, in order to verify that we're
-	// not spawning too many concurrent goroutines, we have to wait on the
-	// Run() loop to consume a specific number of elements from the
-	// queue... and it doesn't signal that in any obvious way, except by
-	// draining the queue. We cap the waiting at 1 second -- that should give
-	// plenty of time, and keeps the failure fairly quick if we're not draining
-	// the queue properly.
-	for i := 0; i < 100 && m.queueLen() > 0; i++ {
-		time.Sleep(10 * time.Millisecond)
-	}
-
-	if m.queueLen() != config.DefaultQueueConfig.MaxSamplesPerSend {
-		t.Fatalf("Failed to drain QueueManager queue, %d elements left",
-			m.queueLen(),
-		)
-	}
-
-	numCalls := c.NumCalls()
-	if numCalls != uint64(1) {
-		t.Errorf("Saw %d concurrent sends, expected 1", numCalls)
-	}
-}
-
-func TestShutdown(t *testing.T) {
-	deadline := 10 * time.Second
-	c := NewTestBlockedStorageClient()
-	m := NewQueueManager(nil, config.DefaultQueueConfig, nil, nil, c, deadline)
-	for i := 0; i < config.DefaultQueueConfig.MaxSamplesPerSend; i++ {
-		m.Append(&model.Sample{
-			Metric: model.Metric{
-				model.MetricNameLabel: model.LabelValue(fmt.Sprintf("test_metric_%d", i)),
-			},
-			Value:     model.SampleValue(i),
-			Timestamp: model.Time(i),
-		})
-	}
-	m.Start()
-
-	start := time.Now()
-	m.Stop()
-	duration := time.Since(start)
-	if duration > deadline+(deadline/10) {
-		t.Errorf("Took too long to shutdown: %s > %s", duration, deadline)
-	}
-}
@@ -19,9 +19,12 @@ import (
 	"time"
 
 	"github.com/go-kit/kit/log"
+
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/pkg/labels"
+	"github.com/prometheus/prometheus/pkg/timestamp"
 	"github.com/prometheus/prometheus/storage"
 )
 
@@ -35,7 +38,13 @@ type Storage struct {
 	mtx sync.RWMutex
 
 	// For writes
-	queues []*QueueManager
+	walDir                 string
+	queues                 []*QueueManager
+	samplesIn              *ewmaRate
+	samplesInMetric        prometheus.Counter
+	highestTimestampMtx    sync.Mutex
+	highestTimestamp       int64
+	highestTimestampMetric prometheus.Gauge
 
 	// For reads
 	queryables []storage.Queryable
@@ -44,15 +53,30 @@ type Storage struct {
 }
 
 // NewStorage returns a remote.Storage.
-func NewStorage(l log.Logger, stCallback startTimeCallback, flushDeadline time.Duration) *Storage {
+func NewStorage(l log.Logger, reg prometheus.Registerer, stCallback startTimeCallback, walDir string, flushDeadline time.Duration) *Storage {
 	if l == nil {
 		l = log.NewNopLogger()
 	}
-	return &Storage{
+	shardUpdateDuration := 10 * time.Second
+	s := &Storage{
 		logger:                 l,
 		localStartTimeCallback: stCallback,
 		flushDeadline:          flushDeadline,
+		walDir:                 walDir,
+		// queues: make(map[*QueueManager]struct{}),
+		samplesIn: newEWMARate(ewmaWeight, shardUpdateDuration),
+		samplesInMetric: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: "prometheus_remote_storage_samples_in_total",
+			Help: "Samples in to remote storage, compare to samples out for queue managers.",
+		}),
+		highestTimestampMetric: prometheus.NewGauge(prometheus.GaugeOpts{
+			Name: "prometheus_remote_storage_highest_timestamp_in",
+			Help: "Highest timestamp that has come into the remote storage via the Appender interface.",
+		}),
 	}
+	reg.MustRegister(s.samplesInMetric)
+	reg.MustRegister(s.highestTimestampMetric)
+	return s
 }
 
 // ApplyConfig updates the state as the new config requires.
@@ -61,7 +85,6 @@ func (s *Storage) ApplyConfig(conf *config.Config) error {
 	defer s.mtx.Unlock()
 
 	// Update write queues
-
 	newQueues := []*QueueManager{}
 	// TODO: we should only stop & recreate queues which have changes,
 	// as this can be quite disruptive.
@@ -74,13 +97,20 @@ func (s *Storage) ApplyConfig(conf *config.Config) error {
 		if err != nil {
 			return err
 		}
+		// Convert to int64 for comparison with timestamps from samples
+		// we will eventually read from the WAL on startup.
+		startTime := timestamp.FromTime(time.Now())
 		newQueues = append(newQueues, NewQueueManager(
 			s.logger,
+			s.walDir,
+			s.samplesIn,
+			&s.highestTimestamp,
 			rwConf.QueueConfig,
 			conf.GlobalConfig.ExternalLabels,
			rwConf.WriteRelabelConfigs,
 			c,
 			s.flushDeadline,
+			startTime,
 		))
 	}
 
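NewStorage now takes a Prometheus registerer and the WAL directory in addition to the start-time callback and flush deadline. A minimal sketch of calling it from outside cmd/prometheus follows; the data directory and the one-minute flush deadline are illustrative values, not defaults defined by this change.

package main

import (
	"time"

	"github.com/go-kit/kit/log"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/prometheus/prometheus/storage/remote"
)

func main() {
	// Start-time callback: with no local data yet, report "now" in milliseconds.
	startTime := func() (int64, error) {
		return time.Now().UnixNano() / int64(time.Millisecond), nil
	}

	s := remote.NewStorage(
		log.NewNopLogger(),           // logger
		prometheus.DefaultRegisterer, // registerer for the new samples-in / highest-timestamp metrics
		startTime,                    // local storage start-time callback
		"data",                       // directory whose wal/ subdirectory will be tailed (illustrative)
		1*time.Minute,                // flush deadline used when shutting queues down (illustrative)
	)
	defer s.Close()
}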
521
storage/remote/wal_watcher.go
Normal file
521
storage/remote/wal_watcher.go
Normal file
|
@ -0,0 +1,521 @@
|
||||||
|
// Copyright 2018 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package remote
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-kit/kit/log"
|
||||||
|
"github.com/go-kit/kit/log/level"
|
||||||
|
"github.com/pkg/errors"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/tsdb"
|
||||||
|
"github.com/prometheus/tsdb/fileutil"
|
||||||
|
"github.com/prometheus/tsdb/wal"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
readPeriod = 10 * time.Millisecond
|
||||||
|
checkpointPeriod = 5 * time.Second
|
||||||
|
segmentCheckPeriod = 100 * time.Millisecond
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
watcherSamplesRecordsRead = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "samples_records_read_total",
|
||||||
|
Help: "Number of samples records read by the WAL watcher from the WAL.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherSeriesRecordsRead = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "series_records_read_total",
|
||||||
|
Help: "Number of series records read by the WAL watcher from the WAL.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherTombstoneRecordsRead = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "tombstone_records_read_total",
|
||||||
|
Help: "Number of tombstone records read by the WAL watcher from the WAL.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherInvalidRecordsRead = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "invalid_records_read_total",
|
||||||
|
Help: "Number of invalid records read by the WAL watcher from the WAL.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherUnknownTypeRecordsRead = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "unknown_records_read_total",
|
||||||
|
Help: "Number of records read by the WAL watcher from the WAL of an unknown record type.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherRecordDecodeFails = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "record_decode_failures_total",
|
||||||
|
Help: "Number of records read by the WAL watcher that resulted in an error when decoding.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherSamplesSentPreTailing = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "samples_sent_pre_tailing_total",
|
||||||
|
Help: "Number of sample records read by the WAL watcher and sent to remote write during replay of existing WAL.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
watcherCurrentSegment = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: "prometheus",
|
||||||
|
Subsystem: "wal_watcher",
|
||||||
|
Name: "current_segment",
|
||||||
|
Help: "Current segment the WAL watcher is reading records from.",
|
||||||
|
},
|
||||||
|
[]string{queue},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
prometheus.MustRegister(watcherSamplesRecordsRead)
|
||||||
|
prometheus.MustRegister(watcherSeriesRecordsRead)
|
||||||
|
prometheus.MustRegister(watcherTombstoneRecordsRead)
|
||||||
|
prometheus.MustRegister(watcherInvalidRecordsRead)
|
||||||
|
prometheus.MustRegister(watcherUnknownTypeRecordsRead)
|
||||||
|
prometheus.MustRegister(watcherRecordDecodeFails)
|
||||||
|
prometheus.MustRegister(watcherSamplesSentPreTailing)
|
||||||
|
prometheus.MustRegister(watcherCurrentSegment)
|
||||||
|
}
|
||||||
|
|
||||||
|
type writeTo interface {
|
||||||
|
Append([]tsdb.RefSample) bool
|
||||||
|
StoreSeries([]tsdb.RefSeries, int)
|
||||||
|
SeriesReset(int)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WALWatcher watches the TSDB WAL for a given WriteTo.
|
||||||
|
type WALWatcher struct {
|
||||||
|
name string
|
||||||
|
writer writeTo
|
||||||
|
logger log.Logger
|
||||||
|
walDir string
|
||||||
|
|
||||||
|
currentSegment int
|
||||||
|
lastCheckpoint string
|
||||||
|
startTime int64
|
||||||
|
|
||||||
|
samplesReadMetric prometheus.Counter
|
||||||
|
seriesReadMetric prometheus.Counter
|
||||||
|
tombstonesReadMetric prometheus.Counter
|
||||||
|
invalidReadMetric prometheus.Counter
|
||||||
|
unknownReadMetric prometheus.Counter
|
||||||
|
recordDecodeFailsMetric prometheus.Counter
|
||||||
|
samplesSentPreTailing prometheus.Counter
|
||||||
|
currentSegmentMetric prometheus.Gauge
|
||||||
|
|
||||||
|
quit chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewWALWatcher creates a new WAL watcher for a given WriteTo.
|
||||||
|
func NewWALWatcher(logger log.Logger, name string, writer writeTo, walDir string, startTime int64) *WALWatcher {
|
||||||
|
if logger == nil {
|
||||||
|
logger = log.NewNopLogger()
|
||||||
|
}
|
||||||
|
w := &WALWatcher{
|
||||||
|
logger: logger,
|
||||||
|
writer: writer,
|
||||||
|
walDir: path.Join(walDir, "wal"),
|
||||||
|
startTime: startTime,
|
||||||
|
name: name,
|
||||||
|
quit: make(chan struct{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
w.samplesReadMetric = watcherSamplesRecordsRead.WithLabelValues(w.name)
|
||||||
|
w.seriesReadMetric = watcherSeriesRecordsRead.WithLabelValues(w.name)
|
||||||
|
w.tombstonesReadMetric = watcherTombstoneRecordsRead.WithLabelValues(w.name)
|
||||||
|
w.unknownReadMetric = watcherUnknownTypeRecordsRead.WithLabelValues(w.name)
|
||||||
|
w.invalidReadMetric = watcherInvalidRecordsRead.WithLabelValues(w.name)
|
||||||
|
w.recordDecodeFailsMetric = watcherRecordDecodeFails.WithLabelValues(w.name)
|
||||||
|
w.samplesSentPreTailing = watcherSamplesSentPreTailing.WithLabelValues(w.name)
|
||||||
|
w.currentSegmentMetric = watcherCurrentSegment.WithLabelValues(w.name)
|
||||||
|
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) Start() {
|
||||||
|
level.Info(w.logger).Log("msg", "starting WAL watcher", "queue", w.name)
|
||||||
|
go w.runWatcher()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) Stop() {
|
||||||
|
level.Info(w.logger).Log("msg", "stopping WAL watcher", "queue", w.name)
|
||||||
|
close(w.quit)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) runWatcher() {
|
||||||
|
// The WAL dir may not exist when Prometheus first starts up.
|
||||||
|
for {
|
||||||
|
if _, err := os.Stat(w.walDir); os.IsNotExist(err) {
|
||||||
|
time.Sleep(time.Second)
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nw, err := wal.New(nil, nil, w.walDir)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
first, last, err := nw.Segments()
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if last == -1 {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Backfill from the checkpoint first if it exists.
|
||||||
|
dir, _, err := tsdb.LastCheckpoint(w.walDir)
|
||||||
|
if err != nil && err != tsdb.ErrNotFound {
|
||||||
|
level.Error(w.logger).Log("msg", "error looking for existing checkpoint, some samples may be dropped", "err", errors.Wrap(err, "find last checkpoint"))
|
||||||
|
}
|
||||||
|
|
||||||
|
level.Debug(w.logger).Log("msg", "reading checkpoint", "dir", dir)
|
||||||
|
if err == nil {
|
||||||
|
w.lastCheckpoint = dir
|
||||||
|
err = w.readCheckpoint(dir)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("msg", "error reading existing checkpoint, some samples may be dropped", "err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
w.currentSegment = first
|
||||||
|
w.currentSegmentMetric.Set(float64(w.currentSegment))
|
||||||
|
segment, err := wal.OpenReadSegment(wal.SegmentName(w.walDir, w.currentSegment))
|
||||||
|
// TODO: callum, is this error really fatal?
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
reader := wal.NewLiveReader(segment)
|
||||||
|
tail := false
|
||||||
|
|
||||||
|
for {
|
||||||
|
// If we've replayed the existing WAL, start tailing.
|
||||||
|
if w.currentSegment == last {
|
||||||
|
tail = true
|
||||||
|
}
|
||||||
|
if tail {
|
||||||
|
level.Info(w.logger).Log("msg", "watching segment", "segment", w.currentSegment)
|
||||||
|
} else {
|
||||||
|
level.Info(w.logger).Log("msg", "replaying segment", "segment", w.currentSegment)
|
||||||
|
}
|
||||||
|
|
||||||
|
// On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment.
|
||||||
|
// On subsequent calls to this function, currentSegment will have been incremented and we should open that segment.
|
||||||
|
err := w.watch(nw, reader, tail)
|
||||||
|
segment.Close()
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("msg", "runWatcher is ending", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
w.currentSegment++
|
||||||
|
w.currentSegmentMetric.Set(float64(w.currentSegment))
|
||||||
|
|
||||||
|
segment, err = wal.OpenReadSegment(wal.SegmentName(w.walDir, w.currentSegment))
|
||||||
|
// TODO: callum, is this error really fatal?
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
reader = wal.NewLiveReader(segment)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use tail true to indicate that the reader is currently on a segment that is
|
||||||
|
// actively being written to. If false, assume it's a full segment and we're
|
||||||
|
// replaying it on start to cache the series records.
|
||||||
|
func (w *WALWatcher) watch(wl *wal.WAL, reader *wal.LiveReader, tail bool) error {
|
||||||
|
|
||||||
|
readTicker := time.NewTicker(readPeriod)
|
||||||
|
defer readTicker.Stop()
|
||||||
|
|
||||||
|
checkpointTicker := time.NewTicker(checkpointPeriod)
|
||||||
|
defer checkpointTicker.Stop()
|
||||||
|
|
||||||
|
segmentTicker := time.NewTicker(segmentCheckPeriod)
|
||||||
|
defer segmentTicker.Stop()
|
||||||
|
// If we're replaying the segment we need to know the size of the file to know
|
||||||
|
// when to return from watch and move on to the next segment.
|
||||||
|
size := int64(math.MaxInt64)
|
||||||
|
if !tail {
|
||||||
|
segmentTicker.Stop()
|
||||||
|
checkpointTicker.Stop()
|
||||||
|
var err error
|
||||||
|
size, err = getSegmentSize(w.walDir, w.currentSegment)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("msg", "error getting segment size", "segment", w.currentSegment)
|
||||||
|
return errors.Wrap(err, "get segment size")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-w.quit:
|
||||||
|
level.Info(w.logger).Log("msg", "quitting WAL watcher watch loop")
|
||||||
|
return errors.New("quit channel")
|
||||||
|
|
||||||
|
case <-checkpointTicker.C:
|
||||||
|
// Periodically check if there is a new checkpoint.
|
||||||
|
// As this is considered an optimisation, we ignore errors during
|
||||||
|
// checkpoint processing.
|
||||||
|
|
||||||
|
dir, _, err := tsdb.LastCheckpoint(w.walDir)
|
||||||
|
if err != nil && err != tsdb.ErrNotFound {
|
||||||
|
level.Error(w.logger).Log("msg", "error getting last checkpoint", "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if dir == w.lastCheckpoint {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
level.Info(w.logger).Log("msg", "new checkpoint detected", "last", w.lastCheckpoint, "new", dir)
|
||||||
|
|
||||||
|
d, err := checkpointNum(dir)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("msg", "error parsing checkpoint", "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if d >= w.currentSegment {
|
||||||
|
level.Info(w.logger).Log("msg", "current segment is behind the checkpoint, skipping reading of checkpoint", "current", fmt.Sprintf("%08d", w.currentSegment), "checkpoint", dir)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
w.lastCheckpoint = dir
|
||||||
|
// This potentially takes a long time, should we run it in another go routine?
|
||||||
|
err = w.readCheckpoint(w.lastCheckpoint)
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
}
|
||||||
|
// Clear series with a checkpoint or segment index # lower than the checkpoint we just read.
|
||||||
|
w.writer.SeriesReset(d)
|
||||||
|
|
||||||
|
case <-segmentTicker.C:
|
||||||
|
_, last, err := wl.Segments()
|
||||||
|
if err != nil {
|
||||||
|
return errors.Wrap(err, "segments")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if new segments exists.
|
||||||
|
if last <= w.currentSegment {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := w.readSegment(reader); err != nil {
|
||||||
|
// Ignore errors reading to end of segment, as we're going to move to
|
||||||
|
// next segment now.
|
||||||
|
level.Error(w.logger).Log("msg", "error reading to end of segment", "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
level.Info(w.logger).Log("msg", "a new segment exists, we should start reading it", "current", fmt.Sprintf("%08d", w.currentSegment), "new", fmt.Sprintf("%08d", last))
|
||||||
|
return nil
|
||||||
|
|
||||||
|
case <-readTicker.C:
|
||||||
|
if err := w.readSegment(reader); err != nil && err != io.EOF {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if reader.TotalRead() >= size && !tail {
|
||||||
|
level.Info(w.logger).Log("msg", "done replaying segment", "segment", w.currentSegment, "size", size, "read", reader.TotalRead())
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) readSegment(r *wal.LiveReader) error {
|
||||||
|
for r.Next() && !isClosed(w.quit) {
|
||||||
|
err := w.decodeRecord(r.Record())
|
||||||
|
|
||||||
|
// Intentionally skip over record decode errors.
|
||||||
|
if err != nil {
|
||||||
|
level.Error(w.logger).Log("err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) decodeRecord(rec []byte) error {
|
||||||
|
var (
|
||||||
|
dec tsdb.RecordDecoder
|
||||||
|
series []tsdb.RefSeries
|
||||||
|
samples []tsdb.RefSample
|
||||||
|
)
|
||||||
|
switch dec.Type(rec) {
|
||||||
|
case tsdb.RecordSeries:
|
||||||
|
series, err := dec.Series(rec, series[:0])
|
||||||
|
if err != nil {
|
||||||
|
w.recordDecodeFailsMetric.Inc()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
w.seriesReadMetric.Add(float64(len(series)))
|
||||||
|
w.writer.StoreSeries(series, w.currentSegment)
|
||||||
|
|
||||||
|
case tsdb.RecordSamples:
|
||||||
|
samples, err := dec.Samples(rec, samples[:0])
|
||||||
|
if err != nil {
|
||||||
|
w.recordDecodeFailsMetric.Inc()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var send []tsdb.RefSample
|
||||||
|
for _, s := range samples {
|
||||||
|
if s.T > w.startTime {
|
||||||
|
send = append(send, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(send) > 0 {
|
||||||
|
// We don't want to count samples read prior to the starting timestamp
|
||||||
|
// so that we can compare samples in vs samples read and succeeded samples.
|
||||||
|
w.samplesReadMetric.Add(float64(len(samples)))
|
||||||
|
// Blocks until the sample is sent to all remote write endpoints or closed (because enqueue blocks).
|
||||||
|
w.writer.Append(send)
|
||||||
|
}
|
||||||
|
|
||||||
|
case tsdb.RecordTombstones:
|
||||||
|
w.tombstonesReadMetric.Add(float64(len(samples)))
|
||||||
|
|
||||||
|
case tsdb.RecordInvalid:
|
||||||
|
w.invalidReadMetric.Add(float64(len(samples)))
|
||||||
|
return errors.New("invalid record")
|
||||||
|
|
||||||
|
default:
|
||||||
|
w.recordDecodeFailsMetric.Inc()
|
||||||
|
return errors.New("unknown TSDB record type")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}

// Read all the series records from a Checkpoint directory.
func (w *WALWatcher) readCheckpoint(checkpointDir string) error {
	level.Info(w.logger).Log("msg", "reading checkpoint", "dir", checkpointDir)
	sr, err := wal.NewSegmentsReader(checkpointDir)
	if err != nil {
		return errors.Wrap(err, "open checkpoint")
	}
	defer sr.Close()

	size, err := getCheckpointSize(checkpointDir)
	if err != nil {
		level.Error(w.logger).Log("msg", "error getting checkpoint size", "checkpoint", checkpointDir)
		return errors.Wrap(err, "get checkpoint size")
	}

	// w.readSeriesRecords(wal.NewLiveReader(sr), i, size)
	r := wal.NewLiveReader(sr)
	w.readSegment(r)
	if r.TotalRead() != size {
		level.Warn(w.logger).Log("msg", "may not have read all data from checkpoint")
	}
	level.Debug(w.logger).Log("msg", "read series references from checkpoint", "checkpoint", checkpointDir)
	return nil
}

func checkpointNum(dir string) (int, error) {
	// Checkpoint dir names are in the format checkpoint.000001
	chunks := strings.Split(dir, ".")
	if len(chunks) != 2 {
		return 0, errors.Errorf("invalid checkpoint dir string: %s", dir)
	}

	result, err := strconv.Atoi(chunks[1])
	if err != nil {
		return 0, errors.Errorf("invalid checkpoint dir string: %s", dir)
	}

	return result, nil
}
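For example (illustration only, not part of the change), a directory name following that convention parses like this:

package main

import (
    "fmt"
    "strconv"
    "strings"
)

func main() {
    // Same two steps checkpointNum performs on "checkpoint.000001".
    chunks := strings.Split("checkpoint.000001", ".")
    n, err := strconv.Atoi(chunks[1])
    fmt.Println(n, err) // 1 <nil>
}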

func getCheckpointSize(dir string) (int64, error) {
	i := int64(0)
	segs, err := fileutil.ReadDir(dir)
	if err != nil {
		return 0, err
	}
	for _, fn := range segs {
		num, err := strconv.Atoi(fn)
		if err != nil {
			return i, err
		}
		sz, err := getSegmentSize(dir, num)
		if err != nil {
			return i, err
		}
		i += sz
	}
	return i, nil
}

// Get size of segment.
func getSegmentSize(dir string, index int) (int64, error) {
	i := int64(-1)
	fi, err := os.Stat(wal.SegmentName(dir, index))
	if err == nil {
		i = fi.Size()
	}
	return i, err
}

func isClosed(c chan struct{}) bool {
	select {
	case <-c:
		return true
	default:
		return false
	}
}
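isClosed is the usual non-blocking probe for a closed quit channel: the receive case fires once the channel is closed (or has a pending value), otherwise the default branch reports false. A self-contained sketch of the same pattern (example code, not part of the diff):

package main

import "fmt"

func isClosed(c chan struct{}) bool {
    select {
    case <-c:
        return true
    default:
        return false
    }
}

func main() {
    quit := make(chan struct{})
    fmt.Println(isClosed(quit)) // false
    close(quit)
    fmt.Println(isClosed(quit)) // true
}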
417 storage/remote/wal_watcher_test.go Normal file
@ -0,0 +1,417 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote

import (
	"fmt"
	"io/ioutil"
	"math/rand"
	"os"
	"path"
	"sync"
	"testing"
	"time"

	"github.com/prometheus/prometheus/pkg/timestamp"
	"github.com/prometheus/prometheus/util/testutil"
	"github.com/prometheus/tsdb"
	"github.com/prometheus/tsdb/labels"
	"github.com/prometheus/tsdb/wal"
)

var defaultRetryInterval = 100 * time.Millisecond
var defaultRetries = 100

// retry executes f() n times at each interval until it returns true.
func retry(t *testing.T, interval time.Duration, n int, f func() bool) {
	t.Helper()
	ticker := time.NewTicker(interval)
	for i := 0; i <= n; i++ {
		if f() {
			return
		}
		t.Logf("retry %d/%d", i, n)
		<-ticker.C
	}
	ticker.Stop()
	t.Logf("function returned false")
}

type writeToMock struct {
	samplesAppended      int
	seriesLock           sync.Mutex
	seriesSegmentIndexes map[uint64]int
}

func (wtm *writeToMock) Append(s []tsdb.RefSample) bool {
	wtm.samplesAppended += len(s)
	return true
}

func (wtm *writeToMock) StoreSeries(series []tsdb.RefSeries, index int) {
	wtm.seriesLock.Lock()
	defer wtm.seriesLock.Unlock()
	for _, s := range series {
		wtm.seriesSegmentIndexes[s.Ref] = index
	}
}

func (wtm *writeToMock) SeriesReset(index int) {
	// Check for series that are in segments older than the checkpoint
	// that were not also present in the checkpoint.
	wtm.seriesLock.Lock()
	defer wtm.seriesLock.Unlock()
	for k, v := range wtm.seriesSegmentIndexes {
		if v < index {
			delete(wtm.seriesSegmentIndexes, k)
		}
	}
}

func (wtm *writeToMock) checkNumLabels() int {
	wtm.seriesLock.Lock()
	defer wtm.seriesLock.Unlock()
	return len(wtm.seriesSegmentIndexes)
}

func newWriteToMock() *writeToMock {
	return &writeToMock{
		seriesSegmentIndexes: make(map[uint64]int),
	}
}
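Each test below follows the same shape: write series and sample records into a temporary WAL, start a WALWatcher that delivers into this mock, then poll with retry until checkNumLabels reaches the expected series count. Abbreviated, this is just the recurring pattern from the tests that follow; dir, st and expected are placeholders:

wt := newWriteToMock()
watcher := NewWALWatcher(nil, "", wt, dir, st)
go watcher.Start()

retry(t, defaultRetryInterval, defaultRetries, func() bool {
    return wt.checkNumLabels() >= expected
})
watcher.Stop()
testutil.Equals(t, expected, wt.checkNumLabels())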

func Test_readToEnd_noCheckpoint(t *testing.T) {
	pageSize := 32 * 1024
	const seriesCount = 10
	const samplesCount = 250

	dir, err := ioutil.TempDir("", "readToEnd_noCheckpoint")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)
	wdir := path.Join(dir, "wal")
	err = os.Mkdir(wdir, 0777)
	testutil.Ok(t, err)

	w, err := wal.NewSize(nil, nil, wdir, 128*pageSize)
	testutil.Ok(t, err)

	var recs [][]byte

	enc := tsdb.RecordEncoder{}

	for i := 0; i < seriesCount; i++ {
		series := enc.Series([]tsdb.RefSeries{
			tsdb.RefSeries{
				Ref:    uint64(i),
				Labels: labels.Labels{labels.Label{"__name__", fmt.Sprintf("metric_%d", i)}},
			},
		}, nil)
		recs = append(recs, series)
		for j := 0; j < samplesCount; j++ {
			sample := enc.Samples([]tsdb.RefSample{
				tsdb.RefSample{
					Ref: uint64(j),
					T:   int64(i),
					V:   float64(i),
				},
			}, nil)

			recs = append(recs, sample)

			// Randomly batch up records.
			if rand.Intn(4) < 3 {
				testutil.Ok(t, w.Log(recs...))
				recs = recs[:0]
			}
		}
	}
	testutil.Ok(t, w.Log(recs...))

	_, _, err = w.Segments()
	testutil.Ok(t, err)

	wt := newWriteToMock()
	st := timestamp.FromTime(time.Now())
	watcher := NewWALWatcher(nil, "", wt, dir, st)
	go watcher.Start()

	expected := seriesCount
	retry(t, defaultRetryInterval, defaultRetries, func() bool {
		return wt.checkNumLabels() >= expected
	})
	watcher.Stop()
	testutil.Equals(t, expected, wt.checkNumLabels())
}

func Test_readToEnd_withCheckpoint(t *testing.T) {
	pageSize := 32 * 1024
	const seriesCount = 10
	const samplesCount = 250

	dir, err := ioutil.TempDir("", "readToEnd_withCheckpoint")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	wdir := path.Join(dir, "wal")
	err = os.Mkdir(wdir, 0777)
	testutil.Ok(t, err)

	os.Create(wal.SegmentName(wdir, 30))

	enc := tsdb.RecordEncoder{}
	w, err := wal.NewSize(nil, nil, wdir, 128*pageSize)
	testutil.Ok(t, err)

	// Write to the initial segment then checkpoint.
	for i := 0; i < seriesCount*10; i++ {
		ref := i + 100
		series := enc.Series([]tsdb.RefSeries{
			tsdb.RefSeries{
				Ref:    uint64(ref),
				Labels: labels.Labels{labels.Label{"__name__", fmt.Sprintf("metric_%d", i)}},
			},
		}, nil)
		testutil.Ok(t, w.Log(series))

		for j := 0; j < samplesCount*10; j++ {
			inner := rand.Intn(ref + 1)
			sample := enc.Samples([]tsdb.RefSample{
				tsdb.RefSample{
					Ref: uint64(inner),
					T:   int64(i),
					V:   float64(i),
				},
			}, nil)
			testutil.Ok(t, w.Log(sample))
		}
	}
	tsdb.Checkpoint(w, 30, 31, func(x uint64) bool { return true }, 0)
	w.Truncate(32)

	// Write more records after checkpointing.
	for i := 0; i < seriesCount*10; i++ {
		series := enc.Series([]tsdb.RefSeries{
			tsdb.RefSeries{
				Ref:    uint64(i),
				Labels: labels.Labels{labels.Label{"__name__", fmt.Sprintf("metric_%d", i)}},
			},
		}, nil)
		testutil.Ok(t, w.Log(series))

		for j := 0; j < samplesCount*10; j++ {
			sample := enc.Samples([]tsdb.RefSample{
				tsdb.RefSample{
					Ref: uint64(j),
					T:   int64(i),
					V:   float64(i),
				},
			}, nil)
			testutil.Ok(t, w.Log(sample))
		}
	}

	_, _, err = w.Segments()
	testutil.Ok(t, err)
	wt := newWriteToMock()
	st := timestamp.FromTime(time.Now())
	watcher := NewWALWatcher(nil, "", wt, dir, st)
	go watcher.Start()

	expected := seriesCount * 10 * 2
	retry(t, defaultRetryInterval, defaultRetries, func() bool {
		return wt.checkNumLabels() >= expected
	})
	watcher.Stop()
	testutil.Equals(t, expected, wt.checkNumLabels())
}

func Test_readCheckpoint(t *testing.T) {
	pageSize := 32 * 1024
	const seriesCount = 10
	const samplesCount = 250

	dir, err := ioutil.TempDir("", "readCheckpoint")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	wdir := path.Join(dir, "wal")
	err = os.Mkdir(wdir, 0777)
	testutil.Ok(t, err)

	os.Create(wal.SegmentName(wdir, 30))

	enc := tsdb.RecordEncoder{}
	w, err := wal.NewSize(nil, nil, wdir, 128*pageSize)
	testutil.Ok(t, err)

	// Write to the initial segment then checkpoint.
	for i := 0; i < seriesCount*10; i++ {
		ref := i + 100
		series := enc.Series([]tsdb.RefSeries{
			tsdb.RefSeries{
				Ref:    uint64(ref),
				Labels: labels.Labels{labels.Label{"__name__", fmt.Sprintf("metric_%d", i)}},
			},
		}, nil)
		testutil.Ok(t, w.Log(series))

		for j := 0; j < samplesCount*10; j++ {
			inner := rand.Intn(ref + 1)
			sample := enc.Samples([]tsdb.RefSample{
				tsdb.RefSample{
					Ref: uint64(inner),
					T:   int64(i),
					V:   float64(i),
				},
			}, nil)
			testutil.Ok(t, w.Log(sample))
		}
	}
	tsdb.Checkpoint(w, 30, 31, func(x uint64) bool { return true }, 0)
	w.Truncate(32)

	// Start read after checkpoint, no more data written.
	_, _, err = w.Segments()
	testutil.Ok(t, err)

	wt := newWriteToMock()
	st := timestamp.FromTime(time.Now())
	watcher := NewWALWatcher(nil, "", wt, dir, st)
	go watcher.Start()

	expected := seriesCount * 10
	retry(t, defaultRetryInterval, defaultRetries, func() bool {
		return wt.checkNumLabels() >= expected
	})
	watcher.Stop()
	testutil.Equals(t, expected, wt.checkNumLabels())
}

func Test_checkpoint_seriesReset(t *testing.T) {
	pageSize := 32 * 1024
	const seriesCount = 10
	const samplesCount = 250

	dir, err := ioutil.TempDir("", "seriesReset")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	wdir := path.Join(dir, "wal")
	err = os.Mkdir(wdir, 0777)
	testutil.Ok(t, err)

	enc := tsdb.RecordEncoder{}
	w, err := wal.NewSize(nil, nil, wdir, pageSize)
	testutil.Ok(t, err)

	// Write to the initial segment, then checkpoint later.
	for i := 0; i < seriesCount*10; i++ {
		ref := i + 100
		series := enc.Series([]tsdb.RefSeries{
			tsdb.RefSeries{
				Ref:    uint64(ref),
				Labels: labels.Labels{labels.Label{"__name__", fmt.Sprintf("metric_%d", i)}},
			},
		}, nil)
		testutil.Ok(t, w.Log(series))

		for j := 0; j < samplesCount*10; j++ {
			inner := rand.Intn(ref + 1)
			sample := enc.Samples([]tsdb.RefSample{
				tsdb.RefSample{
					Ref: uint64(inner),
					T:   int64(i),
					V:   float64(i),
				},
			}, nil)
			testutil.Ok(t, w.Log(sample))
		}
	}

	_, _, err = w.Segments()
	testutil.Ok(t, err)

	wt := newWriteToMock()
	st := timestamp.FromTime(time.Now())
	watcher := NewWALWatcher(nil, "", wt, dir, st)
	go watcher.Start()

	expected := seriesCount * 10
	retry(t, defaultRetryInterval, defaultRetries, func() bool {
		return wt.checkNumLabels() >= expected
	})
	watcher.Stop()
	testutil.Equals(t, seriesCount*10, wt.checkNumLabels())

	// If you modify the checkpoint and truncate segment #'s run the test to see how
	// many series records you end up with and change the last Equals check accordingly
	// or modify the Equals to Assert(len(wt.seriesLabels) < seriesCount*10)
	_, err = tsdb.Checkpoint(w, 50, 200, func(x uint64) bool { return true }, 0)
	testutil.Ok(t, err)
	w.Truncate(200)

	cp, _, err := tsdb.LastCheckpoint(path.Join(dir, "wal"))
	testutil.Ok(t, err)
	err = watcher.readCheckpoint(cp)
	testutil.Ok(t, err)
}

func Test_decodeRecord(t *testing.T) {
	dir, err := ioutil.TempDir("", "decodeRecord")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	wt := newWriteToMock()
	// st := timestamp.FromTime(time.Now().Add(-10 * time.Second))
	watcher := NewWALWatcher(nil, "", wt, dir, 0)

	// decode a series record
	enc := tsdb.RecordEncoder{}
	buf := enc.Series([]tsdb.RefSeries{tsdb.RefSeries{Ref: 1234, Labels: labels.Labels{}}}, nil)
	watcher.decodeRecord(buf)
	testutil.Ok(t, err)

	testutil.Equals(t, 1, wt.checkNumLabels())

	// decode a samples record
	buf = enc.Samples([]tsdb.RefSample{tsdb.RefSample{Ref: 100, T: 1, V: 1.0}, tsdb.RefSample{Ref: 100, T: 2, V: 2.0}}, nil)
	watcher.decodeRecord(buf)
	testutil.Ok(t, err)

	testutil.Equals(t, 2, wt.samplesAppended)
}

func Test_decodeRecord_afterStart(t *testing.T) {
	dir, err := ioutil.TempDir("", "decodeRecord")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	wt := newWriteToMock()
	// st := timestamp.FromTime(time.Now().Add(-10 * time.Second))
	watcher := NewWALWatcher(nil, "", wt, dir, 1)

	// decode a series record
	enc := tsdb.RecordEncoder{}
	buf := enc.Series([]tsdb.RefSeries{tsdb.RefSeries{Ref: 1234, Labels: labels.Labels{}}}, nil)
	watcher.decodeRecord(buf)
	testutil.Ok(t, err)

	testutil.Equals(t, 1, wt.checkNumLabels())

	// decode a samples record
	buf = enc.Samples([]tsdb.RefSample{tsdb.RefSample{Ref: 100, T: 1, V: 1.0}, tsdb.RefSample{Ref: 100, T: 2, V: 2.0}}, nil)
	watcher.decodeRecord(buf)
	testutil.Ok(t, err)

	testutil.Equals(t, 1, wt.samplesAppended)
}
@ -14,44 +14,53 @@
package remote

import (
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/prometheus/prometheus/storage"
)

// Appender implements scrape.Appendable.
func (s *Storage) Appender() (storage.Appender, error) {
	return s, nil
	return &timestampTracker{
		storage: s,
	}, nil
}

type timestampTracker struct {
	storage          *Storage
	samples          int64
	highestTimestamp int64
}

// Add implements storage.Appender.
func (s *Storage) Add(l labels.Labels, t int64, v float64) (uint64, error) {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	for _, q := range s.queues {
		if err := q.Append(&model.Sample{
			Metric:    labelsToMetric(l),
			Timestamp: model.Time(t),
			Value:     model.SampleValue(v),
		}); err != nil {
			panic(err) // QueueManager.Append() should always return nil as per doc string.
		}
	}
func (t *timestampTracker) Add(_ labels.Labels, ts int64, v float64) (uint64, error) {
	t.samples++
	if ts > t.highestTimestamp {
		t.highestTimestamp = ts
	}
	return 0, nil
}

// AddFast implements storage.Appender.
func (s *Storage) AddFast(l labels.Labels, _ uint64, t int64, v float64) error {
	_, err := s.Add(l, t, v)
func (t *timestampTracker) AddFast(l labels.Labels, _ uint64, ts int64, v float64) error {
	_, err := t.Add(l, ts, v)
	return err
}

// Commit implements storage.Appender.
func (*Storage) Commit() error {
func (t *timestampTracker) Commit() error {
	t.storage.samplesIn.incr(t.samples)
	t.storage.samplesInMetric.Add(float64(t.samples))

	t.storage.highestTimestampMtx.Lock()
	defer t.storage.highestTimestampMtx.Unlock()
	if t.highestTimestamp > t.storage.highestTimestamp {
		t.storage.highestTimestamp = t.highestTimestamp
		t.storage.highestTimestampMetric.Set(float64(t.highestTimestamp))
	}
	return nil
}

// Rollback implements storage.Appender.
func (*Storage) Rollback() error {
func (*timestampTracker) Rollback() error {
	return nil
}
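With this change the remote-write Appender no longer fans samples out to the queues itself; it only counts how many samples a scrape appended and tracks the highest timestamp it saw, then folds both into the Storage metrics on Commit. A minimal, self-contained sketch of that bookkeeping, with simplified illustrative signatures rather than the real storage.Appender interface:

package main

import "fmt"

// countingAppender mirrors, in spirit, what timestampTracker does above:
// count samples, track the high-water timestamp, report both on Commit.
type countingAppender struct {
    samples          int64
    highestTimestamp int64
}

func (a *countingAppender) Add(ts int64, v float64) {
    a.samples++
    if ts > a.highestTimestamp {
        a.highestTimestamp = ts
    }
}

func (a *countingAppender) Commit() {
    fmt.Printf("appended %d samples, highest timestamp %d\n", a.samples, a.highestTimestamp)
}

func main() {
    app := &countingAppender{}
    app.Add(1000, 1)
    app.Add(2000, 1)
    app.Commit() // appended 2 samples, highest timestamp 2000
}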
4 vendor/modules.txt vendored
@ -243,12 +243,12 @@ github.com/prometheus/procfs/xfs
github.com/prometheus/procfs/internal/util
# github.com/prometheus/tsdb v0.4.0
github.com/prometheus/tsdb
github.com/prometheus/tsdb/fileutil
github.com/prometheus/tsdb/wal
github.com/prometheus/tsdb/labels
github.com/prometheus/tsdb/chunkenc
github.com/prometheus/tsdb/chunks
github.com/prometheus/tsdb/fileutil
github.com/prometheus/tsdb/index
github.com/prometheus/tsdb/wal
# github.com/samuel/go-zookeeper v0.0.0-20161028232340-1d7be4effb13
github.com/samuel/go-zookeeper/zk
# github.com/sasha-s/go-deadlock v0.0.0-20161201235124-341000892f3d
@ -33,6 +33,7 @@ import (
	"github.com/go-kit/kit/log"
	"github.com/gogo/protobuf/proto"
	"github.com/golang/snappy"
	"github.com/prometheus/client_golang/prometheus"
	config_util "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/promlog"
@ -279,9 +280,14 @@ func TestEndpoints(t *testing.T) {
		Format: &af,
	}

	remote := remote.NewStorage(promlog.New(&promlogConfig), func() (int64, error) {
	dbDir, err := ioutil.TempDir("", "tsdb-api-ready")
	testutil.Ok(t, err)
	defer os.RemoveAll(dbDir)

	testutil.Ok(t, err)
	remote := remote.NewStorage(promlog.New(&promlogConfig), prometheus.DefaultRegisterer, func() (int64, error) {
		return 0, nil
	}, 1*time.Second)
	}, dbDir, 1*time.Second)

	err = remote.ApplyConfig(&config.Config{
		RemoteReadConfigs: []*config.RemoteReadConfig{
File diff suppressed because one or more lines are too long
@ -62,6 +62,12 @@ Prometheus.Graph.prototype.initialize = function() {
  var graphWrapper = self.el.find("#graph_wrapper" + self.id);
  self.queryForm = graphWrapper.find(".query_form");

  // Auto-resize the text area on input or mouseclick
  var resizeTextarea = function(el) {
    var offset = el.offsetHeight - el.clientHeight;
    $(el).css('height', 'auto').css('height', el.scrollHeight + offset);
  };

  self.expr = graphWrapper.find("textarea[name=expr]");
  self.expr.keypress(function(e) {
    const enter = 13;
@ -69,14 +75,13 @@ Prometheus.Graph.prototype.initialize = function() {
      self.queryForm.submit();
      e.preventDefault();
    }

    // Auto-resize the text area on input.
    var offset = this.offsetHeight - this.clientHeight;
    var resizeTextarea = function(el) {
      $(el).css('height', 'auto').css('height', el.scrollHeight + offset);
    };
    $(this).on('keyup input', function() { resizeTextarea(this); });
  });

  self.expr.click(function(e) {
    resizeTextarea(this);
  });

  self.expr.change(self.handleChange);

  self.rangeInput = self.queryForm.find("input[name=range_input]");
@ -21,6 +21,14 @@ function showUnhealthy(_, container) {
}

function init() {
  if (!localStorage.selectedTab || localStorage.selectedTab == "all-targets"){
    $("#all-targets").parent().addClass("active");
    $(".table-container").each(showAll);
  } else if (localStorage.selectedTab == "unhealthy-targets") {
    $("#unhealthy-targets").parent().addClass("active");
    $(".table-container").each(showUnhealthy);
  }

  $("button.targets").click(function () {
    const tableTitle = $(this).closest("h2").find("a").attr("id");

@ -45,8 +53,10 @@ function init() {

    if (target === "all-targets") {
      $(".table-container").each(showAll);
      localStorage.setItem("selectedTab", "all-targets");
    } else if (target === "unhealthy-targets") {
      $(".table-container").each(showUnhealthy);
      localStorage.setItem("selectedTab", "unhealthy-targets");
    }
  });
}
@ -8,7 +8,7 @@
<div class="container-fluid">
  <h1>Targets</h1>
  <div id="showTargets" class="btn-group btn-group-toggle" data-toggle="buttons">
    <label class="btn btn-primary active">
    <label class="btn btn-primary">
      <input type="radio" name="targets" id="all-targets" autocomplete="off" checked> All
    </label>
    <label class="btn btn-primary ml-1">
@ -25,6 +25,10 @@ import (
	"testing"
	"time"

	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/notifier"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/scrape"
	"github.com/prometheus/prometheus/storage/tsdb"
	"github.com/prometheus/prometheus/util/testutil"
	libtsdb "github.com/prometheus/tsdb"
@ -101,8 +105,8 @@ func TestReadyAndHealthy(t *testing.T) {
		Context:        nil,
		Storage:        &tsdb.ReadyStorage{},
		QueryEngine:    nil,
		ScrapeManager:  nil,
		ScrapeManager:  &scrape.Manager{},
		RuleManager:    nil,
		RuleManager:    &rules.Manager{},
		Notifier:       nil,
		RoutePrefix:    "/",
		EnableAdminAPI: true,
@ -118,6 +122,10 @@ func TestReadyAndHealthy(t *testing.T) {
	opts.Flags = map[string]string{}

	webHandler := New(nil, opts)

	webHandler.config = &config.Config{}
	webHandler.notifier = &notifier.Manager{}

	go func() {
		err := webHandler.Run(context.Background())
		if err != nil {
@ -159,6 +167,46 @@ func TestReadyAndHealthy(t *testing.T) {
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/graph")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/alerts")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/flags")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/rules")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/service-discovery")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/targets")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/config")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/status")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusServiceUnavailable, resp.StatusCode)

	// Set to ready.
	webHandler.Ready()

@ -191,6 +239,41 @@ func TestReadyAndHealthy(t *testing.T) {

	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/alerts")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/flags")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/rules")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/service-discovery")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/targets")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/config")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)

	resp, err = http.Get("http://localhost:9090/status")
	testutil.Ok(t, err)
	testutil.Equals(t, http.StatusOK, resp.StatusCode)
}

func TestRoutePrefix(t *testing.T) {