Merge remote-tracking branch 'tsdb/merge-tsdb' into merge-tsdb-new

This commit is contained in:
Ganesh Vernekar 2019-08-13 13:59:13 +05:30
commit 750e438ebb
No known key found for this signature in database
GPG key ID: 0241A11211763456
98 changed files with 48708 additions and 0 deletions

17
tsdb/.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View file

@ -0,0 +1,17 @@
<!--
Don't forget!
- Most PRs would require a CHANGELOG entry.
- If the PR adds or changes a behaviour or fixes a bug of an exported API it would need a unit/e2e test.
- Where possible use only exported APIs for tests to simplify the review and make it as close as possible to an actual library usage.
- No tests are needed for internal implementation changes.
- Performance improvements would need a benchmark test to prove it.
- All exposed objects should have a comment.
- All comments should start with a capital letter and end with a full stop.
-->

1
tsdb/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
benchout/

5
tsdb/.golangci.yml Normal file
View file

@ -0,0 +1,5 @@
# Run only staticcheck for now. Additional linters will be enabled one-by-one.
linters:
enable:
- staticcheck
disable-all: true

20
tsdb/.travis.yml Normal file
View file

@ -0,0 +1,20 @@
dist: trusty
language: go
os:
- windows
- linux
- osx
go:
- 1.12.x
go_import_path: github.com/prometheus/tsdb
before_install:
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install make; fi
install:
- make deps
script:
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then make test; else make all; fi

108
tsdb/CHANGELOG.md Normal file
View file

@ -0,0 +1,108 @@
## master / unreleased
## 0.10.0
- [FEATURE] Added `DBReadOnly` to allow opening a database in read only mode.
- `DBReadOnly.Blocks()` exposes a slice of `BlockReader`s.
- `BlockReader` interface - removed MinTime/MaxTime methods and now exposes the full block meta via `Meta()`.
- [FEATURE] `chunckenc.Chunk.Iterator` method now takes a `chunckenc.Iterator` interface as an argument for reuse.
## 0.9.1
- [CHANGE] LiveReader metrics are now injected rather than global.
## 0.9.0
- [FEATURE] Provide option to compress WAL records using Snappy. [#609](https://github.com/prometheus/tsdb/pull/609)
- [BUGFIX] Re-calculate block size when calling `block.Delete`.
- [BUGFIX] Re-encode all head chunks at compaction that are open (being appended to) or outside the Maxt block range. This avoids writing out corrupt data. It happens when snapshotting with the head included.
- [BUGFIX] Improved handling of multiple refs for the same series in WAL reading.
- [BUGFIX] `prometheus_tsdb_compactions_failed_total` is now incremented on any compaction failure.
- [CHANGE] The meta file `BlockStats` no longer holds size information. This is now dynamically calculated and kept in memory. It also includes the meta file size which was not included before.
- [CHANGE] Create new clean segment when starting the WAL.
- [CHANGE] Renamed metric from `prometheus_tsdb_wal_reader_corruption_errors` to `prometheus_tsdb_wal_reader_corruption_errors_total`.
- [ENHANCEMENT] Improved atomicity of .tmp block replacement during compaction for usual case.
- [ENHANCEMENT] Improved postings intersection matching.
- [ENHANCEMENT] Reduced disk usage for WAL for small setups.
- [ENHANCEMENT] Optimize queries using regexp for set lookups.
## 0.8.0
- [BUGFIX] Calling `Close` more than once on a querier returns an error instead of a panic.
- [BUGFIX] Don't panic and recover nicely when running out of disk space.
- [BUGFIX] Correctly handle empty labels.
- [BUGFIX] Don't crash on an unknown tombstone ref.
- [ENHANCEMENT] Re-add FromData function to create a chunk from bytes. It is used by Cortex and Thanos.
- [ENHANCEMENT] Simplify mergedPostings.Seek.
- [FEATURE] Added `currentSegment` metric for the current WAL segment it is being written to.
## 0.7.1
- [ENHANCEMENT] Reduce memory usage in mergedPostings.Seek
## 0.7.0
- [CHANGE] tsdb now requires golang 1.12 or higher.
- [REMOVED] `chunks.NewReader` is removed as it wasn't used anywhere.
- [REMOVED] `FromData` is considered unused so was removed.
- [FEATURE] Added option WALSegmentSize -1 to disable the WAL.
- [BUGFIX] Bugfix in selectOverlappingDirs. Only return the first overlapping blocks.
- [BUGFIX] Fsync the meta file to persist it on disk to avoid data loss in case of a host crash.
- [BUGFIX] Fix fd and vm_area leak on error path in chunks.NewDirReader.
- [BUGFIX] Fix fd and vm_area leak on error path in index.NewFileReader.
- [BUGFIX] Force persisting the tombstone file to avoid data loss in case of a host crash.
- [BUGFIX] Keep series that are still in WAL in checkpoints.
- [ENHANCEMENT] Fast path for EmptyPostings cases in Merge, Intersect and Without.
- [ENHANCEMENT] Be smarter in how we look at matchers.
- [ENHANCEMENT] PostListings and NotMatcher now public.
## 0.6.1
- [BUGFIX] Update `last` after appending a non-overlapping chunk in `chunks.MergeOverlappingChunks`. [#539](https://github.com/prometheus/tsdb/pull/539)
## 0.6.0
- [CHANGE] `AllowOverlappingBlock` is now `AllowOverlappingBlocks`.
## 0.5.0
- [FEATURE] Time-ovelapping blocks are now allowed. [#370](https://github.com/prometheus/tsdb/pull/370)
- Disabled by default and can be enabled via `AllowOverlappingBlock` option.
- Added `MergeChunks` function in `chunkenc/xor.go` to merge 2 time-overlapping chunks.
- Added `MergeOverlappingChunks` function in `chunks/chunks.go` to merge multiple time-overlapping Chunk Metas.
- Added `MinTime` and `MaxTime` method for `BlockReader`.
- [FEATURE] New `dump` command to tsdb tool to dump all samples.
- [FEATURE] New `encoding` package for common binary encoding/decoding helpers.
- Added to remove some code duplication.
- [ENHANCEMENT] When closing the db any running compaction will be cancelled so it doesn't block.
- `NewLeveledCompactor` takes a context.
- [CHANGE] `prometheus_tsdb_storage_blocks_bytes_total` is now `prometheus_tsdb_storage_blocks_bytes`.
- [BUGFIX] Improved Postings Merge performance. Fixes a regression from the the previous release.
- [BUGFIX] LiveReader can get into an infinite loop on corrupt WALs.
## 0.4.0
- [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
- [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
- [FEATURE] Size base retention through `Options.MaxBytes`. As part of this change:
- Added new metrics - `prometheus_tsdb_storage_blocks_bytes_total`, `prometheus_tsdb_size_retentions_total`, `prometheus_tsdb_time_retentions_total`
- New public interface `SizeReader: Size() int64`
- `OpenBlock` signature changed to take a logger.
- [REMOVED] `PrefixMatcher` is considered unused so was removed.
- [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
- [FEATURE] Add new `LiveReader` to WAL pacakge. Added to allow live tailing of a WAL segment, used by Prometheus Remote Write after refactor. The main difference between the new reader and the existing `Reader` is that for `LiveReader` a call to `Next()` that returns false does not mean that there will never be more data to read.
## 0.3.1
- [BUGFIX] Fixed most windows test and some actual bugs for unclosed file readers.
## 0.3.0
- [CHANGE] `LastCheckpoint()` used to return just the segment name and now it returns the full relative path.
- [CHANGE] `NewSegmentsRangeReader()` can now read over miltiple wal ranges by using the new `SegmentRange{}` struct.
- [CHANGE] `CorruptionErr{}` now also exposes the Segment `Dir` which is added when displaying any errors.
- [CHANGE] `Head.Init()` is changed to `Head.Init(minValidTime int64)`
- [CHANGE] `SymbolTable()` renamed to `SymbolTableSize()` to make the name consistent with the `Block{ symbolTableSize uint64 }` field.
- [CHANGE] `wal.Reader{}` now exposes `Segment()` for the current segment being read and `Offset()` for the current offset.
- [FEATURE] tsdbutil analyze subcomand to find churn, high cardinality, etc.

201
tsdb/LICENSE Normal file
View file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

4
tsdb/MAINTAINERS.md Normal file
View file

@ -0,0 +1,4 @@
Maintainers of this repository:
* Krasi Georgiev <kgeorgie@redhat.com> @krasi-georgiev
* Goutham Veeramachaneni <gouthamve@gmail.com> @gouthamve

33
tsdb/Makefile Normal file
View file

@ -0,0 +1,33 @@
# Copyright 2018 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TSDB_PROJECT_DIR = "."
TSDB_CLI_DIR="$(TSDB_PROJECT_DIR)/cmd/tsdb"
TSDB_BIN = "$(TSDB_CLI_DIR)/tsdb"
TSDB_BENCHMARK_NUM_METRICS ?= 1000
TSDB_BENCHMARK_DATASET ?= "$(TSDB_PROJECT_DIR)/testdata/20kseries.json"
TSDB_BENCHMARK_OUTPUT_DIR ?= "$(TSDB_CLI_DIR)/benchout"
include Makefile.common
build:
GO111MODULE=$(GO111MODULE) $(GO) build -o $(TSDB_BIN) $(TSDB_CLI_DIR)
bench: build
@echo ">> running benchmark, writing result to $(TSDB_BENCHMARK_OUTPUT_DIR)"
@$(TSDB_BIN) bench write --metrics=$(TSDB_BENCHMARK_NUM_METRICS) --out=$(TSDB_BENCHMARK_OUTPUT_DIR) $(TSDB_BENCHMARK_DATASET)
@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/cpu.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/cpuprof.svg
@$(GO) tool pprof --inuse_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.inuse.svg
@$(GO) tool pprof --alloc_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.alloc.svg
@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/block.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/blockprof.svg
@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mutex.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/mutexprof.svg

277
tsdb/Makefile.common Normal file
View file

@ -0,0 +1,277 @@
# Copyright 2018 The Prometheus Authors
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A common Makefile that includes rules to be reused in different prometheus projects.
# !!! Open PRs only against the prometheus/prometheus/Makefile.common repository!
# Example usage :
# Create the main Makefile in the root project directory.
# include Makefile.common
# customTarget:
# @echo ">> Running customTarget"
#
# Ensure GOBIN is not set during build so that promu is installed to the correct path
unexport GOBIN
GO ?= go
GOFMT ?= $(GO)fmt
FIRST_GOPATH := $(firstword $(subst :, ,$(shell $(GO) env GOPATH)))
GOOPTS ?=
GOHOSTOS ?= $(shell $(GO) env GOHOSTOS)
GOHOSTARCH ?= $(shell $(GO) env GOHOSTARCH)
GO_VERSION ?= $(shell $(GO) version)
GO_VERSION_NUMBER ?= $(word 3, $(GO_VERSION))
PRE_GO_111 ?= $(shell echo $(GO_VERSION_NUMBER) | grep -E 'go1\.(10|[0-9])\.')
GOVENDOR :=
GO111MODULE :=
ifeq (, $(PRE_GO_111))
ifneq (,$(wildcard go.mod))
# Enforce Go modules support just in case the directory is inside GOPATH (and for Travis CI).
GO111MODULE := on
ifneq (,$(wildcard vendor))
# Always use the local vendor/ directory to satisfy the dependencies.
GOOPTS := $(GOOPTS) -mod=vendor
endif
endif
else
ifneq (,$(wildcard go.mod))
ifneq (,$(wildcard vendor))
$(warning This repository requires Go >= 1.11 because of Go modules)
$(warning Some recipes may not work as expected as the current Go runtime is '$(GO_VERSION_NUMBER)')
endif
else
# This repository isn't using Go modules (yet).
GOVENDOR := $(FIRST_GOPATH)/bin/govendor
endif
endif
PROMU := $(FIRST_GOPATH)/bin/promu
pkgs = ./...
ifeq (arm, $(GOHOSTARCH))
GOHOSTARM ?= $(shell GOARM= $(GO) env GOARM)
GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH)v$(GOHOSTARM)
else
GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH)
endif
PROMU_VERSION ?= 0.5.0
PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_VERSION)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM).tar.gz
GOLANGCI_LINT :=
GOLANGCI_LINT_OPTS ?=
GOLANGCI_LINT_VERSION ?= v1.17.1
# golangci-lint only supports linux, darwin and windows platforms on i386/amd64.
# windows isn't included here because of the path separator being different.
ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin))
ifeq ($(GOHOSTARCH),$(filter $(GOHOSTARCH),amd64 i386))
GOLANGCI_LINT := $(FIRST_GOPATH)/bin/golangci-lint
endif
endif
PREFIX ?= $(shell pwd)
BIN_DIR ?= $(shell pwd)
DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))
DOCKERFILE_PATH ?= ./Dockerfile
DOCKERBUILD_CONTEXT ?= ./
DOCKER_REPO ?= prom
DOCKER_ARCHS ?= amd64
BUILD_DOCKER_ARCHS = $(addprefix common-docker-,$(DOCKER_ARCHS))
PUBLISH_DOCKER_ARCHS = $(addprefix common-docker-publish-,$(DOCKER_ARCHS))
TAG_DOCKER_ARCHS = $(addprefix common-docker-tag-latest-,$(DOCKER_ARCHS))
ifeq ($(GOHOSTARCH),amd64)
ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux freebsd darwin windows))
# Only supported on amd64
test-flags := -race
endif
endif
# This rule is used to forward a target like "build" to "common-build". This
# allows a new "build" target to be defined in a Makefile which includes this
# one and override "common-build" without override warnings.
%: common-% ;
.PHONY: common-all
common-all: precheck style check_license lint unused build test
.PHONY: common-style
common-style:
@echo ">> checking code style"
@fmtRes=$$($(GOFMT) -d $$(find . -path ./vendor -prune -o -name '*.go' -print)); \
if [ -n "$${fmtRes}" ]; then \
echo "gofmt checking failed!"; echo "$${fmtRes}"; echo; \
echo "Please ensure you are using $$($(GO) version) for formatting code."; \
exit 1; \
fi
.PHONY: common-check_license
common-check_license:
@echo ">> checking license header"
@licRes=$$(for file in $$(find . -type f -iname '*.go' ! -path './vendor/*') ; do \
awk 'NR<=3' $$file | grep -Eq "(Copyright|generated|GENERATED)" || echo $$file; \
done); \
if [ -n "$${licRes}" ]; then \
echo "license header checking failed:"; echo "$${licRes}"; \
exit 1; \
fi
.PHONY: common-deps
common-deps:
@echo ">> getting dependencies"
ifdef GO111MODULE
GO111MODULE=$(GO111MODULE) $(GO) mod download
else
$(GO) get $(GOOPTS) -t ./...
endif
.PHONY: common-test-short
common-test-short:
@echo ">> running short tests"
GO111MODULE=$(GO111MODULE) $(GO) test -short $(GOOPTS) $(pkgs)
.PHONY: common-test
common-test:
@echo ">> running all tests"
GO111MODULE=$(GO111MODULE) $(GO) test $(test-flags) $(GOOPTS) $(pkgs)
.PHONY: common-format
common-format:
@echo ">> formatting code"
GO111MODULE=$(GO111MODULE) $(GO) fmt $(pkgs)
.PHONY: common-vet
common-vet:
@echo ">> vetting code"
GO111MODULE=$(GO111MODULE) $(GO) vet $(GOOPTS) $(pkgs)
.PHONY: common-lint
common-lint: $(GOLANGCI_LINT)
ifdef GOLANGCI_LINT
@echo ">> running golangci-lint"
ifdef GO111MODULE
# 'go list' needs to be executed before staticcheck to prepopulate the modules cache.
# Otherwise staticcheck might fail randomly for some reason not yet explained.
GO111MODULE=$(GO111MODULE) $(GO) list -e -compiled -test=true -export=false -deps=true -find=false -tags= -- ./... > /dev/null
GO111MODULE=$(GO111MODULE) $(GOLANGCI_LINT) run $(GOLANGCI_LINT_OPTS) $(pkgs)
else
$(GOLANGCI_LINT) run $(pkgs)
endif
endif
# For backward-compatibility.
.PHONY: common-staticcheck
common-staticcheck: lint
.PHONY: common-unused
common-unused: $(GOVENDOR)
ifdef GOVENDOR
@echo ">> running check for unused packages"
@$(GOVENDOR) list +unused | grep . && exit 1 || echo 'No unused packages'
else
ifdef GO111MODULE
@echo ">> running check for unused/missing packages in go.mod"
GO111MODULE=$(GO111MODULE) $(GO) mod tidy
ifeq (,$(wildcard vendor))
@git diff --exit-code -- go.sum go.mod
else
@echo ">> running check for unused packages in vendor/"
GO111MODULE=$(GO111MODULE) $(GO) mod vendor
@git diff --exit-code -- go.sum go.mod vendor/
endif
endif
endif
.PHONY: common-build
common-build: promu
@echo ">> building binaries"
GO111MODULE=$(GO111MODULE) $(PROMU) build --prefix $(PREFIX)
.PHONY: common-tarball
common-tarball: promu
@echo ">> building release tarball"
$(PROMU) tarball --prefix $(PREFIX) $(BIN_DIR)
.PHONY: common-docker $(BUILD_DOCKER_ARCHS)
common-docker: $(BUILD_DOCKER_ARCHS)
$(BUILD_DOCKER_ARCHS): common-docker-%:
docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)" \
-f $(DOCKERFILE_PATH) \
--build-arg ARCH="$*" \
--build-arg OS="linux" \
$(DOCKERBUILD_CONTEXT)
.PHONY: common-docker-publish $(PUBLISH_DOCKER_ARCHS)
common-docker-publish: $(PUBLISH_DOCKER_ARCHS)
$(PUBLISH_DOCKER_ARCHS): common-docker-publish-%:
docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)"
.PHONY: common-docker-tag-latest $(TAG_DOCKER_ARCHS)
common-docker-tag-latest: $(TAG_DOCKER_ARCHS)
$(TAG_DOCKER_ARCHS): common-docker-tag-latest-%:
docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:latest"
.PHONY: common-docker-manifest
common-docker-manifest:
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):$(DOCKER_IMAGE_TAG))
DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)"
.PHONY: promu
promu: $(PROMU)
$(PROMU):
$(eval PROMU_TMP := $(shell mktemp -d))
curl -s -L $(PROMU_URL) | tar -xvzf - -C $(PROMU_TMP)
mkdir -p $(FIRST_GOPATH)/bin
cp $(PROMU_TMP)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM)/promu $(FIRST_GOPATH)/bin/promu
rm -r $(PROMU_TMP)
.PHONY: proto
proto:
@echo ">> generating code from proto files"
@./scripts/genproto.sh
ifdef GOLANGCI_LINT
$(GOLANGCI_LINT):
mkdir -p $(FIRST_GOPATH)/bin
curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/$(GOLANGCI_LINT_VERSION)/install.sh \
| sed -e '/install -d/d' \
| sh -s -- -b $(FIRST_GOPATH)/bin $(GOLANGCI_LINT_VERSION)
endif
ifdef GOVENDOR
.PHONY: $(GOVENDOR)
$(GOVENDOR):
GOOS= GOARCH= $(GO) get -u github.com/kardianos/govendor
endif
.PHONY: precheck
precheck::
define PRECHECK_COMMAND_template =
precheck:: $(1)_precheck
PRECHECK_COMMAND_$(1) ?= $(1) $$(strip $$(PRECHECK_OPTIONS_$(1)))
.PHONY: $(1)_precheck
$(1)_precheck:
@if ! $$(PRECHECK_COMMAND_$(1)) 1>/dev/null 2>&1; then \
echo "Execution of '$$(PRECHECK_COMMAND_$(1))' command failed. Is $(1) installed?"; \
exit 1; \
fi
endef

15
tsdb/README.md Normal file
View file

@ -0,0 +1,15 @@
# TSDB
[![Build Status](https://travis-ci.org/prometheus/tsdb.svg?branch=master)](https://travis-ci.org/prometheus/tsdb)
[![GoDoc](https://godoc.org/github.com/prometheus/tsdb?status.svg)](https://godoc.org/github.com/prometheus/tsdb)
[![Go Report Card](https://goreportcard.com/badge/github.com/prometheus/tsdb)](https://goreportcard.com/report/github.com/prometheus/tsdb)
This repository contains the Prometheus storage layer that is used in its 2.x releases.
A writeup of its design can be found [here](https://fabxc.org/blog/2017-04-10-writing-a-tsdb/).
Based on the Gorilla TSDB [white papers](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
Video: [Storing 16 Bytes at Scale](https://youtu.be/b_pEevMAC3I) from [PromCon 2017](https://promcon.io/2017-munich/).
See also the [format documentation](docs/format/README.md).

656
tsdb/block.go Normal file
View file

@ -0,0 +1,656 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"encoding/json"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/chunkenc"
"github.com/prometheus/tsdb/chunks"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/index"
"github.com/prometheus/tsdb/labels"
)
// IndexWriter serializes the index for a block of series data.
// The methods must be called in the order they are specified in.
type IndexWriter interface {
// AddSymbols registers all string symbols that are encountered in series
// and other indices.
AddSymbols(sym map[string]struct{}) error
// AddSeries populates the index writer with a series and its offsets
// of chunks that the index can reference.
// Implementations may require series to be insert in increasing order by
// their labels.
// The reference numbers are used to resolve entries in postings lists that
// are added later.
AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error
// WriteLabelIndex serializes an index from label names to values.
// The passed in values chained tuples of strings of the length of names.
WriteLabelIndex(names []string, values []string) error
// WritePostings writes a postings list for a single label pair.
// The Postings here contain refs to the series that were added.
WritePostings(name, value string, it index.Postings) error
// Close writes any finalization and closes the resources associated with
// the underlying writer.
Close() error
}
// IndexReader provides reading access of serialized index data.
type IndexReader interface {
// Symbols returns a set of string symbols that may occur in series' labels
// and indices.
Symbols() (map[string]struct{}, error)
// LabelValues returns the possible label values.
LabelValues(names ...string) (index.StringTuples, error)
// Postings returns the postings list iterator for the label pair.
// The Postings here contain the offsets to the series inside the index.
// Found IDs are not strictly required to point to a valid Series, e.g. during
// background garbage collections.
Postings(name, value string) (index.Postings, error)
// SortedPostings returns a postings list that is reordered to be sorted
// by the label set of the underlying series.
SortedPostings(index.Postings) index.Postings
// Series populates the given labels and chunk metas for the series identified
// by the reference.
// Returns ErrNotFound if the ref does not resolve to a known series.
Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error
// LabelIndices returns a list of string tuples for which a label value index exists.
// NOTE: This is deprecated. Use `LabelNames()` instead.
LabelIndices() ([][]string, error)
// LabelNames returns all the unique label names present in the index in sorted order.
LabelNames() ([]string, error)
// Close releases the underlying resources of the reader.
Close() error
}
// StringTuples provides access to a sorted list of string tuples.
type StringTuples interface {
// Total number of tuples in the list.
Len() int
// At returns the tuple at position i.
At(i int) ([]string, error)
}
// ChunkWriter serializes a time block of chunked series data.
type ChunkWriter interface {
// WriteChunks writes several chunks. The Chunk field of the ChunkMetas
// must be populated.
// After returning successfully, the Ref fields in the ChunkMetas
// are set and can be used to retrieve the chunks from the written data.
WriteChunks(chunks ...chunks.Meta) error
// Close writes any required finalization and closes the resources
// associated with the underlying writer.
Close() error
}
// ChunkReader provides reading access of serialized time series data.
type ChunkReader interface {
// Chunk returns the series data chunk with the given reference.
Chunk(ref uint64) (chunkenc.Chunk, error)
// Close releases all underlying resources of the reader.
Close() error
}
// BlockReader provides reading access to a data block.
type BlockReader interface {
// Index returns an IndexReader over the block's data.
Index() (IndexReader, error)
// Chunks returns a ChunkReader over the block's data.
Chunks() (ChunkReader, error)
// Tombstones returns a TombstoneReader over the block's deleted data.
Tombstones() (TombstoneReader, error)
// Meta provides meta information about the block reader.
Meta() BlockMeta
}
// Appendable defines an entity to which data can be appended.
type Appendable interface {
// Appender returns a new Appender against an underlying store.
Appender() Appender
}
// BlockMeta provides meta information about a block.
type BlockMeta struct {
// Unique identifier for the block and its contents. Changes on compaction.
ULID ulid.ULID `json:"ulid"`
// MinTime and MaxTime specify the time range all samples
// in the block are in.
MinTime int64 `json:"minTime"`
MaxTime int64 `json:"maxTime"`
// Stats about the contents of the block.
Stats BlockStats `json:"stats,omitempty"`
// Information on compactions the block was created from.
Compaction BlockMetaCompaction `json:"compaction"`
// Version of the index format.
Version int `json:"version"`
}
// BlockStats contains stats about contents of a block.
type BlockStats struct {
NumSamples uint64 `json:"numSamples,omitempty"`
NumSeries uint64 `json:"numSeries,omitempty"`
NumChunks uint64 `json:"numChunks,omitempty"`
NumTombstones uint64 `json:"numTombstones,omitempty"`
}
// BlockDesc describes a block by ULID and time range.
type BlockDesc struct {
ULID ulid.ULID `json:"ulid"`
MinTime int64 `json:"minTime"`
MaxTime int64 `json:"maxTime"`
}
// BlockMetaCompaction holds information about compactions a block went through.
type BlockMetaCompaction struct {
// Maximum number of compaction cycles any source block has
// gone through.
Level int `json:"level"`
// ULIDs of all source head blocks that went into the block.
Sources []ulid.ULID `json:"sources,omitempty"`
// Indicates that during compaction it resulted in a block without any samples
// so it should be deleted on the next reload.
Deletable bool `json:"deletable,omitempty"`
// Short descriptions of the direct blocks that were used to create
// this block.
Parents []BlockDesc `json:"parents,omitempty"`
Failed bool `json:"failed,omitempty"`
}
const indexFilename = "index"
const metaFilename = "meta.json"
func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }
func readMetaFile(dir string) (*BlockMeta, int64, error) {
b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename))
if err != nil {
return nil, 0, err
}
var m BlockMeta
if err := json.Unmarshal(b, &m); err != nil {
return nil, 0, err
}
if m.Version != 1 {
return nil, 0, errors.Errorf("unexpected meta file version %d", m.Version)
}
return &m, int64(len(b)), nil
}
func writeMetaFile(logger log.Logger, dir string, meta *BlockMeta) (int64, error) {
meta.Version = 1
// Make any changes to the file appear atomic.
path := filepath.Join(dir, metaFilename)
tmp := path + ".tmp"
defer func() {
if err := os.RemoveAll(tmp); err != nil {
level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
}
}()
f, err := os.Create(tmp)
if err != nil {
return 0, err
}
jsonMeta, err := json.MarshalIndent(meta, "", "\t")
if err != nil {
return 0, err
}
var merr tsdb_errors.MultiError
n, err := f.Write(jsonMeta)
if err != nil {
merr.Add(err)
merr.Add(f.Close())
return 0, merr.Err()
}
// Force the kernel to persist the file on disk to avoid data loss if the host crashes.
if err := f.Sync(); err != nil {
merr.Add(err)
merr.Add(f.Close())
return 0, merr.Err()
}
if err := f.Close(); err != nil {
return 0, err
}
return int64(n), fileutil.Replace(tmp, path)
}
// Block represents a directory of time series data covering a continuous time range.
type Block struct {
mtx sync.RWMutex
closing bool
pendingReaders sync.WaitGroup
dir string
meta BlockMeta
// Symbol Table Size in bytes.
// We maintain this variable to avoid recalculation everytime.
symbolTableSize uint64
chunkr ChunkReader
indexr IndexReader
tombstones TombstoneReader
logger log.Logger
numBytesChunks int64
numBytesIndex int64
numBytesTombstone int64
numBytesMeta int64
}
// OpenBlock opens the block in the directory. It can be passed a chunk pool, which is used
// to instantiate chunk structs.
func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (pb *Block, err error) {
if logger == nil {
logger = log.NewNopLogger()
}
var closers []io.Closer
defer func() {
if err != nil {
var merr tsdb_errors.MultiError
merr.Add(err)
merr.Add(closeAll(closers))
err = merr.Err()
}
}()
meta, sizeMeta, err := readMetaFile(dir)
if err != nil {
return nil, err
}
cr, err := chunks.NewDirReader(chunkDir(dir), pool)
if err != nil {
return nil, err
}
closers = append(closers, cr)
ir, err := index.NewFileReader(filepath.Join(dir, indexFilename))
if err != nil {
return nil, err
}
closers = append(closers, ir)
tr, sizeTomb, err := readTombstones(dir)
if err != nil {
return nil, err
}
closers = append(closers, tr)
pb = &Block{
dir: dir,
meta: *meta,
chunkr: cr,
indexr: ir,
tombstones: tr,
symbolTableSize: ir.SymbolTableSize(),
logger: logger,
numBytesChunks: cr.Size(),
numBytesIndex: ir.Size(),
numBytesTombstone: sizeTomb,
numBytesMeta: sizeMeta,
}
return pb, nil
}
// Close closes the on-disk block. It blocks as long as there are readers reading from the block.
func (pb *Block) Close() error {
pb.mtx.Lock()
pb.closing = true
pb.mtx.Unlock()
pb.pendingReaders.Wait()
var merr tsdb_errors.MultiError
merr.Add(pb.chunkr.Close())
merr.Add(pb.indexr.Close())
merr.Add(pb.tombstones.Close())
return merr.Err()
}
func (pb *Block) String() string {
return pb.meta.ULID.String()
}
// Dir returns the directory of the block.
func (pb *Block) Dir() string { return pb.dir }
// Meta returns meta information about the block.
func (pb *Block) Meta() BlockMeta { return pb.meta }
// MinTime returns the min time of the meta.
func (pb *Block) MinTime() int64 { return pb.meta.MinTime }
// MaxTime returns the max time of the meta.
func (pb *Block) MaxTime() int64 { return pb.meta.MaxTime }
// Size returns the number of bytes that the block takes up.
func (pb *Block) Size() int64 {
return pb.numBytesChunks + pb.numBytesIndex + pb.numBytesTombstone + pb.numBytesMeta
}
// ErrClosing is returned when a block is in the process of being closed.
var ErrClosing = errors.New("block is closing")
func (pb *Block) startRead() error {
pb.mtx.RLock()
defer pb.mtx.RUnlock()
if pb.closing {
return ErrClosing
}
pb.pendingReaders.Add(1)
return nil
}
// Index returns a new IndexReader against the block data.
func (pb *Block) Index() (IndexReader, error) {
if err := pb.startRead(); err != nil {
return nil, err
}
return blockIndexReader{ir: pb.indexr, b: pb}, nil
}
// Chunks returns a new ChunkReader against the block data.
func (pb *Block) Chunks() (ChunkReader, error) {
if err := pb.startRead(); err != nil {
return nil, err
}
return blockChunkReader{ChunkReader: pb.chunkr, b: pb}, nil
}
// Tombstones returns a new TombstoneReader against the block data.
func (pb *Block) Tombstones() (TombstoneReader, error) {
if err := pb.startRead(); err != nil {
return nil, err
}
return blockTombstoneReader{TombstoneReader: pb.tombstones, b: pb}, nil
}
// GetSymbolTableSize returns the Symbol Table Size in the index of this block.
func (pb *Block) GetSymbolTableSize() uint64 {
return pb.symbolTableSize
}
func (pb *Block) setCompactionFailed() error {
pb.meta.Compaction.Failed = true
n, err := writeMetaFile(pb.logger, pb.dir, &pb.meta)
if err != nil {
return err
}
pb.numBytesMeta = n
return nil
}
type blockIndexReader struct {
ir IndexReader
b *Block
}
func (r blockIndexReader) Symbols() (map[string]struct{}, error) {
s, err := r.ir.Symbols()
return s, errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
func (r blockIndexReader) LabelValues(names ...string) (index.StringTuples, error) {
st, err := r.ir.LabelValues(names...)
return st, errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
func (r blockIndexReader) Postings(name, value string) (index.Postings, error) {
p, err := r.ir.Postings(name, value)
if err != nil {
return p, errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
return p, nil
}
func (r blockIndexReader) SortedPostings(p index.Postings) index.Postings {
return r.ir.SortedPostings(p)
}
func (r blockIndexReader) Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error {
if err := r.ir.Series(ref, lset, chks); err != nil {
return errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
return nil
}
func (r blockIndexReader) LabelIndices() ([][]string, error) {
ss, err := r.ir.LabelIndices()
return ss, errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
func (r blockIndexReader) LabelNames() ([]string, error) {
return r.b.LabelNames()
}
func (r blockIndexReader) Close() error {
r.b.pendingReaders.Done()
return nil
}
type blockTombstoneReader struct {
TombstoneReader
b *Block
}
func (r blockTombstoneReader) Close() error {
r.b.pendingReaders.Done()
return nil
}
type blockChunkReader struct {
ChunkReader
b *Block
}
func (r blockChunkReader) Close() error {
r.b.pendingReaders.Done()
return nil
}
// Delete matching series between mint and maxt in the block.
func (pb *Block) Delete(mint, maxt int64, ms ...labels.Matcher) error {
pb.mtx.Lock()
defer pb.mtx.Unlock()
if pb.closing {
return ErrClosing
}
p, err := PostingsForMatchers(pb.indexr, ms...)
if err != nil {
return errors.Wrap(err, "select series")
}
ir := pb.indexr
// Choose only valid postings which have chunks in the time-range.
stones := newMemTombstones()
var lset labels.Labels
var chks []chunks.Meta
Outer:
for p.Next() {
err := ir.Series(p.At(), &lset, &chks)
if err != nil {
return err
}
for _, chk := range chks {
if chk.OverlapsClosedInterval(mint, maxt) {
// Delete only until the current values and not beyond.
tmin, tmax := clampInterval(mint, maxt, chks[0].MinTime, chks[len(chks)-1].MaxTime)
stones.addInterval(p.At(), Interval{tmin, tmax})
continue Outer
}
}
}
if p.Err() != nil {
return p.Err()
}
err = pb.tombstones.Iter(func(id uint64, ivs Intervals) error {
for _, iv := range ivs {
stones.addInterval(id, iv)
}
return nil
})
if err != nil {
return err
}
pb.tombstones = stones
pb.meta.Stats.NumTombstones = pb.tombstones.Total()
n, err := writeTombstoneFile(pb.logger, pb.dir, pb.tombstones)
if err != nil {
return err
}
pb.numBytesTombstone = n
n, err = writeMetaFile(pb.logger, pb.dir, &pb.meta)
if err != nil {
return err
}
pb.numBytesMeta = n
return nil
}
// CleanTombstones will remove the tombstones and rewrite the block (only if there are any tombstones).
// If there was a rewrite, then it returns the ULID of the new block written, else nil.
func (pb *Block) CleanTombstones(dest string, c Compactor) (*ulid.ULID, error) {
numStones := 0
if err := pb.tombstones.Iter(func(id uint64, ivs Intervals) error {
numStones += len(ivs)
return nil
}); err != nil {
// This should never happen, as the iteration function only returns nil.
panic(err)
}
if numStones == 0 {
return nil, nil
}
meta := pb.Meta()
uid, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime, &meta)
if err != nil {
return nil, err
}
return &uid, nil
}
// Snapshot creates snapshot of the block into dir.
func (pb *Block) Snapshot(dir string) error {
blockDir := filepath.Join(dir, pb.meta.ULID.String())
if err := os.MkdirAll(blockDir, 0777); err != nil {
return errors.Wrap(err, "create snapshot block dir")
}
chunksDir := chunkDir(blockDir)
if err := os.MkdirAll(chunksDir, 0777); err != nil {
return errors.Wrap(err, "create snapshot chunk dir")
}
// Hardlink meta, index and tombstones
for _, fname := range []string{
metaFilename,
indexFilename,
tombstoneFilename,
} {
if err := os.Link(filepath.Join(pb.dir, fname), filepath.Join(blockDir, fname)); err != nil {
return errors.Wrapf(err, "create snapshot %s", fname)
}
}
// Hardlink the chunks
curChunkDir := chunkDir(pb.dir)
files, err := ioutil.ReadDir(curChunkDir)
if err != nil {
return errors.Wrap(err, "ReadDir the current chunk dir")
}
for _, f := range files {
err := os.Link(filepath.Join(curChunkDir, f.Name()), filepath.Join(chunksDir, f.Name()))
if err != nil {
return errors.Wrap(err, "hardlink a chunk")
}
}
return nil
}
// OverlapsClosedInterval returns true if the block overlaps [mint, maxt].
func (pb *Block) OverlapsClosedInterval(mint, maxt int64) bool {
// The block itself is a half-open interval
// [pb.meta.MinTime, pb.meta.MaxTime).
return pb.meta.MinTime <= maxt && mint < pb.meta.MaxTime
}
// LabelNames returns all the unique label names present in the Block in sorted order.
func (pb *Block) LabelNames() ([]string, error) {
return pb.indexr.LabelNames()
}
func clampInterval(a, b, mint, maxt int64) (int64, int64) {
if a < mint {
a = mint
}
if b > maxt {
b = maxt
}
return a, b
}

295
tsdb/block_test.go Normal file
View file

@ -0,0 +1,295 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"context"
"encoding/binary"
"errors"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/go-kit/kit/log"
"github.com/prometheus/tsdb/chunks"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
"github.com/prometheus/tsdb/tsdbutil"
)
// In Prometheus 2.1.0 we had a bug where the meta.json version was falsely bumped
// to 2. We had a migration in place resetting it to 1 but we should move immediately to
// version 3 next time to avoid confusion and issues.
func TestBlockMetaMustNeverBeVersion2(t *testing.T) {
dir, err := ioutil.TempDir("", "metaversion")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
_, err = writeMetaFile(log.NewNopLogger(), dir, &BlockMeta{})
testutil.Ok(t, err)
meta, _, err := readMetaFile(dir)
testutil.Ok(t, err)
testutil.Assert(t, meta.Version != 2, "meta.json version must never be 2")
}
func TestSetCompactionFailed(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 1))
b, err := OpenBlock(nil, blockDir, nil)
testutil.Ok(t, err)
testutil.Equals(t, false, b.meta.Compaction.Failed)
testutil.Ok(t, b.setCompactionFailed())
testutil.Equals(t, true, b.meta.Compaction.Failed)
testutil.Ok(t, b.Close())
b, err = OpenBlock(nil, blockDir, nil)
testutil.Ok(t, err)
testutil.Equals(t, true, b.meta.Compaction.Failed)
testutil.Ok(t, b.Close())
}
func TestCreateBlock(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
b, err := OpenBlock(nil, createBlock(t, tmpdir, genSeries(1, 1, 0, 10)), nil)
if err == nil {
testutil.Ok(t, b.Close())
}
testutil.Ok(t, err)
}
func TestCorruptedChunk(t *testing.T) {
for name, test := range map[string]struct {
corrFunc func(f *os.File) // Func that applies the corruption.
expErr error
}{
"invalid header size": {
func(f *os.File) {
err := f.Truncate(1)
testutil.Ok(t, err)
},
errors.New("invalid chunk header in segment 0: invalid size"),
},
"invalid magic number": {
func(f *os.File) {
magicChunksOffset := int64(0)
_, err := f.Seek(magicChunksOffset, 0)
testutil.Ok(t, err)
// Set invalid magic number.
b := make([]byte, chunks.MagicChunksSize)
binary.BigEndian.PutUint32(b[:chunks.MagicChunksSize], 0x00000000)
n, err := f.Write(b)
testutil.Ok(t, err)
testutil.Equals(t, chunks.MagicChunksSize, n)
},
errors.New("invalid magic number 0"),
},
"invalid chunk format version": {
func(f *os.File) {
chunksFormatVersionOffset := int64(4)
_, err := f.Seek(chunksFormatVersionOffset, 0)
testutil.Ok(t, err)
// Set invalid chunk format version.
b := make([]byte, chunks.ChunksFormatVersionSize)
b[0] = 0
n, err := f.Write(b)
testutil.Ok(t, err)
testutil.Equals(t, chunks.ChunksFormatVersionSize, n)
},
errors.New("invalid chunk format version 0"),
},
} {
t.Run(name, func(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test_open_block_chunk_corrupted")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 1))
files, err := sequenceFiles(chunkDir(blockDir))
testutil.Ok(t, err)
testutil.Assert(t, len(files) > 0, "No chunk created.")
f, err := os.OpenFile(files[0], os.O_RDWR, 0666)
testutil.Ok(t, err)
// Apply corruption function.
test.corrFunc(f)
testutil.Ok(t, f.Close())
_, err = OpenBlock(nil, blockDir, nil)
testutil.Equals(t, test.expErr.Error(), err.Error())
})
}
}
// TestBlockSize ensures that the block size is calculated correctly.
func TestBlockSize(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test_blockSize")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
var (
blockInit *Block
expSizeInit int64
blockDirInit string
)
// Create a block and compare the reported size vs actual disk size.
{
blockDirInit = createBlock(t, tmpdir, genSeries(10, 1, 1, 100))
blockInit, err = OpenBlock(nil, blockDirInit, nil)
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, blockInit.Close())
}()
expSizeInit = blockInit.Size()
actSizeInit := testutil.DirSize(t, blockInit.Dir())
testutil.Equals(t, expSizeInit, actSizeInit)
}
// Delete some series and check the sizes again.
{
testutil.Ok(t, blockInit.Delete(1, 10, labels.NewMustRegexpMatcher("", ".*")))
expAfterDelete := blockInit.Size()
testutil.Assert(t, expAfterDelete > expSizeInit, "after a delete the block size should be bigger as the tombstone file should grow %v > %v", expAfterDelete, expSizeInit)
actAfterDelete := testutil.DirSize(t, blockDirInit)
testutil.Ok(t, err)
testutil.Equals(t, expAfterDelete, actAfterDelete, "after a delete reported block size doesn't match actual disk size")
c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil)
testutil.Ok(t, err)
blockDirAfterCompact, err := c.Compact(tmpdir, []string{blockInit.Dir()}, nil)
testutil.Ok(t, err)
blockAfterCompact, err := OpenBlock(nil, filepath.Join(tmpdir, blockDirAfterCompact.String()), nil)
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, blockAfterCompact.Close())
}()
expAfterCompact := blockAfterCompact.Size()
actAfterCompact := testutil.DirSize(t, blockAfterCompact.Dir())
testutil.Assert(t, actAfterDelete > actAfterCompact, "after a delete and compaction the block size should be smaller %v,%v", actAfterDelete, actAfterCompact)
testutil.Equals(t, expAfterCompact, actAfterCompact, "after a delete and compaction reported block size doesn't match actual disk size")
}
}
// createBlock creates a block with given set of series and returns its dir.
func createBlock(tb testing.TB, dir string, series []Series) string {
head := createHead(tb, series)
compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{1000000}, nil)
testutil.Ok(tb, err)
testutil.Ok(tb, os.MkdirAll(dir, 0777))
// Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
// Because of this block intervals are always +1 than the total samples it includes.
ulid, err := compactor.Write(dir, head, head.MinTime(), head.MaxTime()+1, nil)
testutil.Ok(tb, err)
return filepath.Join(dir, ulid.String())
}
func createHead(tb testing.TB, series []Series) *Head {
head, err := NewHead(nil, nil, nil, 2*60*60*1000)
testutil.Ok(tb, err)
defer head.Close()
app := head.Appender()
for _, s := range series {
ref := uint64(0)
it := s.Iterator()
for it.Next() {
t, v := it.At()
if ref != 0 {
err := app.AddFast(ref, t, v)
if err == nil {
continue
}
}
ref, err = app.Add(s.Labels(), t, v)
testutil.Ok(tb, err)
}
testutil.Ok(tb, it.Err())
}
err = app.Commit()
testutil.Ok(tb, err)
return head
}
const (
defaultLabelName = "labelName"
defaultLabelValue = "labelValue"
)
// genSeries generates series with a given number of labels and values.
func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
if totalSeries == 0 || labelCount == 0 {
return nil
}
series := make([]Series, totalSeries)
for i := 0; i < totalSeries; i++ {
lbls := make(map[string]string, labelCount)
lbls[defaultLabelName] = strconv.Itoa(i)
for j := 1; len(lbls) < labelCount; j++ {
lbls[defaultLabelName+strconv.Itoa(j)] = defaultLabelValue + strconv.Itoa(j)
}
samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
for t := mint; t < maxt; t++ {
samples = append(samples, sample{t: t, v: rand.Float64()})
}
series[i] = newSeries(lbls, samples)
}
return series
}
// populateSeries generates series from given labels, mint and maxt.
func populateSeries(lbls []map[string]string, mint, maxt int64) []Series {
if len(lbls) == 0 {
return nil
}
series := make([]Series, 0, len(lbls))
for _, lbl := range lbls {
if len(lbl) == 0 {
continue
}
samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
for t := mint; t <= maxt; t++ {
samples = append(samples, sample{t: t, v: rand.Float64()})
}
series = append(series, newSeries(lbl, samples))
}
return series
}

261
tsdb/checkpoint.go Normal file
View file

@ -0,0 +1,261 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"io"
"io/ioutil"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/pkg/errors"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/wal"
)
// CheckpointStats returns stats about a created checkpoint.
type CheckpointStats struct {
DroppedSeries int
DroppedSamples int
DroppedTombstones int
TotalSeries int // Processed series including dropped ones.
TotalSamples int // Processed samples including dropped ones.
TotalTombstones int // Processed tombstones including dropped ones.
}
// LastCheckpoint returns the directory name and index of the most recent checkpoint.
// If dir does not contain any checkpoints, ErrNotFound is returned.
func LastCheckpoint(dir string) (string, int, error) {
files, err := ioutil.ReadDir(dir)
if err != nil {
return "", 0, err
}
// Traverse list backwards since there may be multiple checkpoints left.
for i := len(files) - 1; i >= 0; i-- {
fi := files[i]
if !strings.HasPrefix(fi.Name(), checkpointPrefix) {
continue
}
if !fi.IsDir() {
return "", 0, errors.Errorf("checkpoint %s is not a directory", fi.Name())
}
idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):])
if err != nil {
continue
}
return filepath.Join(dir, fi.Name()), idx, nil
}
return "", 0, ErrNotFound
}
// DeleteCheckpoints deletes all checkpoints in a directory below a given index.
func DeleteCheckpoints(dir string, maxIndex int) error {
var errs tsdb_errors.MultiError
files, err := ioutil.ReadDir(dir)
if err != nil {
return err
}
for _, fi := range files {
if !strings.HasPrefix(fi.Name(), checkpointPrefix) {
continue
}
index, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):])
if err != nil || index >= maxIndex {
continue
}
if err := os.RemoveAll(filepath.Join(dir, fi.Name())); err != nil {
errs.Add(err)
}
}
return errs.Err()
}
const checkpointPrefix = "checkpoint."
// Checkpoint creates a compacted checkpoint of segments in range [first, last] in the given WAL.
// It includes the most recent checkpoint if it exists.
// All series not satisfying keep and samples below mint are dropped.
//
// The checkpoint is stored in a directory named checkpoint.N in the same
// segmented format as the original WAL itself.
// This makes it easy to read it through the WAL package and concatenate
// it with the original WAL.
func Checkpoint(w *wal.WAL, from, to int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) {
stats := &CheckpointStats{}
var sgmReader io.ReadCloser
{
var sgmRange []wal.SegmentRange
dir, idx, err := LastCheckpoint(w.Dir())
if err != nil && err != ErrNotFound {
return nil, errors.Wrap(err, "find last checkpoint")
}
last := idx + 1
if err == nil {
if from > last {
return nil, fmt.Errorf("unexpected gap to last checkpoint. expected:%v, requested:%v", last, from)
}
// Ignore WAL files below the checkpoint. They shouldn't exist to begin with.
from = last
sgmRange = append(sgmRange, wal.SegmentRange{Dir: dir, Last: math.MaxInt32})
}
sgmRange = append(sgmRange, wal.SegmentRange{Dir: w.Dir(), First: from, Last: to})
sgmReader, err = wal.NewSegmentsRangeReader(sgmRange...)
if err != nil {
return nil, errors.Wrap(err, "create segment reader")
}
defer sgmReader.Close()
}
cpdir := filepath.Join(w.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", to))
cpdirtmp := cpdir + ".tmp"
if err := os.MkdirAll(cpdirtmp, 0777); err != nil {
return nil, errors.Wrap(err, "create checkpoint dir")
}
cp, err := wal.New(nil, nil, cpdirtmp, w.CompressionEnabled())
if err != nil {
return nil, errors.Wrap(err, "open checkpoint")
}
// Ensures that an early return caused by an error doesn't leave any tmp files.
defer func() {
cp.Close()
os.RemoveAll(cpdirtmp)
}()
r := wal.NewReader(sgmReader)
var (
series []RefSeries
samples []RefSample
tstones []Stone
dec RecordDecoder
enc RecordEncoder
buf []byte
recs [][]byte
)
for r.Next() {
series, samples, tstones = series[:0], samples[:0], tstones[:0]
// We don't reset the buffer since we batch up multiple records
// before writing them to the checkpoint.
// Remember where the record for this iteration starts.
start := len(buf)
rec := r.Record()
switch dec.Type(rec) {
case RecordSeries:
series, err = dec.Series(rec, series)
if err != nil {
return nil, errors.Wrap(err, "decode series")
}
// Drop irrelevant series in place.
repl := series[:0]
for _, s := range series {
if keep(s.Ref) {
repl = append(repl, s)
}
}
if len(repl) > 0 {
buf = enc.Series(repl, buf)
}
stats.TotalSeries += len(series)
stats.DroppedSeries += len(series) - len(repl)
case RecordSamples:
samples, err = dec.Samples(rec, samples)
if err != nil {
return nil, errors.Wrap(err, "decode samples")
}
// Drop irrelevant samples in place.
repl := samples[:0]
for _, s := range samples {
if s.T >= mint {
repl = append(repl, s)
}
}
if len(repl) > 0 {
buf = enc.Samples(repl, buf)
}
stats.TotalSamples += len(samples)
stats.DroppedSamples += len(samples) - len(repl)
case RecordTombstones:
tstones, err = dec.Tombstones(rec, tstones)
if err != nil {
return nil, errors.Wrap(err, "decode deletes")
}
// Drop irrelevant tombstones in place.
repl := tstones[:0]
for _, s := range tstones {
for _, iv := range s.intervals {
if iv.Maxt >= mint {
repl = append(repl, s)
break
}
}
}
if len(repl) > 0 {
buf = enc.Tombstones(repl, buf)
}
stats.TotalTombstones += len(tstones)
stats.DroppedTombstones += len(tstones) - len(repl)
default:
return nil, errors.New("invalid record type")
}
if len(buf[start:]) == 0 {
continue // All contents discarded.
}
recs = append(recs, buf[start:])
// Flush records in 1 MB increments.
if len(buf) > 1*1024*1024 {
if err := cp.Log(recs...); err != nil {
return nil, errors.Wrap(err, "flush records")
}
buf, recs = buf[:0], recs[:0]
}
}
// If we hit any corruption during checkpointing, repairing is not an option.
// The head won't know which series records are lost.
if r.Err() != nil {
return nil, errors.Wrap(r.Err(), "read segments")
}
// Flush remaining records.
if err := cp.Log(recs...); err != nil {
return nil, errors.Wrap(err, "flush records")
}
if err := cp.Close(); err != nil {
return nil, errors.Wrap(err, "close checkpoint")
}
if err := fileutil.Replace(cpdirtmp, cpdir); err != nil {
return nil, errors.Wrap(err, "rename checkpoint directory")
}
return stats, nil
}

224
tsdb/checkpoint_test.go Normal file
View file

@ -0,0 +1,224 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"testing"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
"github.com/prometheus/tsdb/wal"
)
func TestLastCheckpoint(t *testing.T) {
dir, err := ioutil.TempDir("", "test_checkpoint")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
_, _, err = LastCheckpoint(dir)
testutil.Equals(t, ErrNotFound, err)
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.0000"), 0777))
s, k, err := LastCheckpoint(dir)
testutil.Ok(t, err)
testutil.Equals(t, filepath.Join(dir, "checkpoint.0000"), s)
testutil.Equals(t, 0, k)
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.xyz"), 0777))
s, k, err = LastCheckpoint(dir)
testutil.Ok(t, err)
testutil.Equals(t, filepath.Join(dir, "checkpoint.0000"), s)
testutil.Equals(t, 0, k)
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.1"), 0777))
s, k, err = LastCheckpoint(dir)
testutil.Ok(t, err)
testutil.Equals(t, filepath.Join(dir, "checkpoint.1"), s)
testutil.Equals(t, 1, k)
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.1000"), 0777))
s, k, err = LastCheckpoint(dir)
testutil.Ok(t, err)
testutil.Equals(t, filepath.Join(dir, "checkpoint.1000"), s)
testutil.Equals(t, 1000, k)
}
func TestDeleteCheckpoints(t *testing.T) {
dir, err := ioutil.TempDir("", "test_checkpoint")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
testutil.Ok(t, DeleteCheckpoints(dir, 0))
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.00"), 0777))
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.01"), 0777))
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.02"), 0777))
testutil.Ok(t, os.MkdirAll(filepath.Join(dir, "checkpoint.03"), 0777))
testutil.Ok(t, DeleteCheckpoints(dir, 2))
files, err := fileutil.ReadDir(dir)
testutil.Ok(t, err)
testutil.Equals(t, []string{"checkpoint.02", "checkpoint.03"}, files)
}
func TestCheckpoint(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
dir, err := ioutil.TempDir("", "test_checkpoint")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
var enc RecordEncoder
// Create a dummy segment to bump the initial number.
seg, err := wal.CreateSegment(dir, 100)
testutil.Ok(t, err)
testutil.Ok(t, seg.Close())
// Manually create checkpoint for 99 and earlier.
w, err := wal.New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress)
testutil.Ok(t, err)
// Add some data we expect to be around later.
err = w.Log(enc.Series([]RefSeries{
{Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")},
{Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")},
}, nil))
testutil.Ok(t, err)
testutil.Ok(t, w.Close())
// Start a WAL and write records to it as usual.
w, err = wal.NewSize(nil, nil, dir, 64*1024, compress)
testutil.Ok(t, err)
var last int64
for i := 0; ; i++ {
_, n, err := w.Segments()
testutil.Ok(t, err)
if n >= 106 {
break
}
// Write some series initially.
if i == 0 {
b := enc.Series([]RefSeries{
{Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")},
{Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")},
{Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")},
{Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")},
}, nil)
testutil.Ok(t, w.Log(b))
}
// Write samples until the WAL has enough segments.
// Make them have drifting timestamps within a record to see that they
// get filtered properly.
b := enc.Samples([]RefSample{
{Ref: 0, T: last, V: float64(i)},
{Ref: 1, T: last + 10000, V: float64(i)},
{Ref: 2, T: last + 20000, V: float64(i)},
{Ref: 3, T: last + 30000, V: float64(i)},
}, nil)
testutil.Ok(t, w.Log(b))
last += 100
}
testutil.Ok(t, w.Close())
_, err = Checkpoint(w, 100, 106, func(x uint64) bool {
return x%2 == 0
}, last/2)
testutil.Ok(t, err)
testutil.Ok(t, w.Truncate(107))
testutil.Ok(t, DeleteCheckpoints(w.Dir(), 106))
// Only the new checkpoint should be left.
files, err := fileutil.ReadDir(dir)
testutil.Ok(t, err)
testutil.Equals(t, 1, len(files))
testutil.Equals(t, "checkpoint.000106", files[0])
sr, err := wal.NewSegmentsReader(filepath.Join(dir, "checkpoint.000106"))
testutil.Ok(t, err)
defer sr.Close()
var dec RecordDecoder
var series []RefSeries
r := wal.NewReader(sr)
for r.Next() {
rec := r.Record()
switch dec.Type(rec) {
case RecordSeries:
series, err = dec.Series(rec, series)
testutil.Ok(t, err)
case RecordSamples:
samples, err := dec.Samples(rec, nil)
testutil.Ok(t, err)
for _, s := range samples {
testutil.Assert(t, s.T >= last/2, "sample with wrong timestamp")
}
}
}
testutil.Ok(t, r.Err())
testutil.Equals(t, []RefSeries{
{Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")},
{Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")},
{Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")},
}, series)
})
}
}
func TestCheckpointNoTmpFolderAfterError(t *testing.T) {
// Create a new wal with an invalid records.
dir, err := ioutil.TempDir("", "test_checkpoint")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := wal.NewSize(nil, nil, dir, 64*1024, false)
testutil.Ok(t, err)
testutil.Ok(t, w.Log([]byte{99}))
w.Close()
// Run the checkpoint and since the wal contains an invalid records this should return an error.
_, err = Checkpoint(w, 0, 1, nil, 0)
testutil.NotOk(t, err)
// Walk the wal dir to make sure there are no tmp folder left behind after the error.
err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error {
if err != nil {
return errors.Wrapf(err, "access err %q: %v\n", path, err)
}
if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") {
return fmt.Errorf("wal dir contains temporary folder:%s", info.Name())
}
return nil
})
testutil.Ok(t, err)
}

200
tsdb/chunkenc/bstream.go Normal file
View file

@ -0,0 +1,200 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The code in this file was largely written by Damian Gryski as part of
// https://github.com/dgryski/go-tsz and published under the license below.
// It received minor modifications to suit Prometheus's needs.
// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package chunkenc
import "io"
// bstream is a stream of bits.
type bstream struct {
stream []byte // the data stream
count uint8 // how many bits are valid in current byte
}
func newBReader(b []byte) bstream {
return bstream{stream: b, count: 8}
}
func (b *bstream) bytes() []byte {
return b.stream
}
type bit bool
const (
zero bit = false
one bit = true
)
func (b *bstream) writeBit(bit bit) {
if b.count == 0 {
b.stream = append(b.stream, 0)
b.count = 8
}
i := len(b.stream) - 1
if bit {
b.stream[i] |= 1 << (b.count - 1)
}
b.count--
}
func (b *bstream) writeByte(byt byte) {
if b.count == 0 {
b.stream = append(b.stream, 0)
b.count = 8
}
i := len(b.stream) - 1
// fill up b.b with b.count bits from byt
b.stream[i] |= byt >> (8 - b.count)
b.stream = append(b.stream, 0)
i++
b.stream[i] = byt << b.count
}
func (b *bstream) writeBits(u uint64, nbits int) {
u <<= (64 - uint(nbits))
for nbits >= 8 {
byt := byte(u >> 56)
b.writeByte(byt)
u <<= 8
nbits -= 8
}
for nbits > 0 {
b.writeBit((u >> 63) == 1)
u <<= 1
nbits--
}
}
func (b *bstream) readBit() (bit, error) {
if len(b.stream) == 0 {
return false, io.EOF
}
if b.count == 0 {
b.stream = b.stream[1:]
if len(b.stream) == 0 {
return false, io.EOF
}
b.count = 8
}
d := (b.stream[0] << (8 - b.count)) & 0x80
b.count--
return d != 0, nil
}
func (b *bstream) ReadByte() (byte, error) {
return b.readByte()
}
func (b *bstream) readByte() (byte, error) {
if len(b.stream) == 0 {
return 0, io.EOF
}
if b.count == 0 {
b.stream = b.stream[1:]
if len(b.stream) == 0 {
return 0, io.EOF
}
return b.stream[0], nil
}
if b.count == 8 {
b.count = 0
return b.stream[0], nil
}
byt := b.stream[0] << (8 - b.count)
b.stream = b.stream[1:]
if len(b.stream) == 0 {
return 0, io.EOF
}
// We just advanced the stream and can assume the shift to be 0.
byt |= b.stream[0] >> b.count
return byt, nil
}
func (b *bstream) readBits(nbits int) (uint64, error) {
var u uint64
for nbits >= 8 {
byt, err := b.readByte()
if err != nil {
return 0, err
}
u = (u << 8) | uint64(byt)
nbits -= 8
}
if nbits == 0 {
return u, nil
}
if nbits > int(b.count) {
u = (u << uint(b.count)) | uint64((b.stream[0]<<(8-b.count))>>(8-b.count))
nbits -= int(b.count)
b.stream = b.stream[1:]
if len(b.stream) == 0 {
return 0, io.EOF
}
b.count = 8
}
u = (u << uint(nbits)) | uint64((b.stream[0]<<(8-b.count))>>(8-uint(nbits)))
b.count -= uint8(nbits)
return u, nil
}

138
tsdb/chunkenc/chunk.go Normal file
View file

@ -0,0 +1,138 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package chunkenc
import (
"fmt"
"sync"
"github.com/pkg/errors"
)
// Encoding is the identifier for a chunk encoding.
type Encoding uint8
func (e Encoding) String() string {
switch e {
case EncNone:
return "none"
case EncXOR:
return "XOR"
}
return "<unknown>"
}
// The different available chunk encodings.
const (
EncNone Encoding = iota
EncXOR
)
// Chunk holds a sequence of sample pairs that can be iterated over and appended to.
type Chunk interface {
Bytes() []byte
Encoding() Encoding
Appender() (Appender, error)
// The iterator passed as argument is for re-use.
// Depending on implementation, the iterator can
// be re-used or a new iterator can be allocated.
Iterator(Iterator) Iterator
NumSamples() int
}
// Appender adds sample pairs to a chunk.
type Appender interface {
Append(int64, float64)
}
// Iterator is a simple iterator that can only get the next value.
type Iterator interface {
At() (int64, float64)
Err() error
Next() bool
}
// NewNopIterator returns a new chunk iterator that does not hold any data.
func NewNopIterator() Iterator {
return nopIterator{}
}
type nopIterator struct{}
func (nopIterator) At() (int64, float64) { return 0, 0 }
func (nopIterator) Next() bool { return false }
func (nopIterator) Err() error { return nil }
// Pool is used to create and reuse chunk references to avoid allocations.
type Pool interface {
Put(Chunk) error
Get(e Encoding, b []byte) (Chunk, error)
}
// pool is a memory pool of chunk objects.
type pool struct {
xor sync.Pool
}
// NewPool returns a new pool.
func NewPool() Pool {
return &pool{
xor: sync.Pool{
New: func() interface{} {
return &XORChunk{b: bstream{}}
},
},
}
}
func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
switch e {
case EncXOR:
c := p.xor.Get().(*XORChunk)
c.b.stream = b
c.b.count = 0
return c, nil
}
return nil, errors.Errorf("invalid encoding %q", e)
}
func (p *pool) Put(c Chunk) error {
switch c.Encoding() {
case EncXOR:
xc, ok := c.(*XORChunk)
// This may happen often with wrapped chunks. Nothing we can really do about
// it but returning an error would cause a lot of allocations again. Thus,
// we just skip it.
if !ok {
return nil
}
xc.b.stream = nil
xc.b.count = 0
p.xor.Put(c)
default:
return errors.Errorf("invalid encoding %q", c.Encoding())
}
return nil
}
// FromData returns a chunk from a byte slice of chunk data.
// This is there so that users of the library can easily create chunks from
// bytes.
func FromData(e Encoding, d []byte) (Chunk, error) {
switch e {
case EncXOR:
return &XORChunk{b: bstream{count: 0, stream: d}}, nil
}
return nil, fmt.Errorf("unknown chunk encoding: %d", e)
}

202
tsdb/chunkenc/chunk_test.go Normal file
View file

@ -0,0 +1,202 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package chunkenc
import (
"fmt"
"io"
"math/rand"
"reflect"
"testing"
"github.com/prometheus/tsdb/testutil"
)
type pair struct {
t int64
v float64
}
func TestChunk(t *testing.T) {
for enc, nc := range map[Encoding]func() Chunk{
EncXOR: func() Chunk { return NewXORChunk() },
} {
t.Run(fmt.Sprintf("%v", enc), func(t *testing.T) {
for range make([]struct{}, 1) {
c := nc()
if err := testChunk(c); err != nil {
t.Fatal(err)
}
}
})
}
}
func testChunk(c Chunk) error {
app, err := c.Appender()
if err != nil {
return err
}
var exp []pair
var (
ts = int64(1234123324)
v = 1243535.123
)
for i := 0; i < 300; i++ {
ts += int64(rand.Intn(10000) + 1)
// v = rand.Float64()
if i%2 == 0 {
v += float64(rand.Intn(1000000))
} else {
v -= float64(rand.Intn(1000000))
}
// Start with a new appender every 10th sample. This emulates starting
// appending to a partially filled chunk.
if i%10 == 0 {
app, err = c.Appender()
if err != nil {
return err
}
}
app.Append(ts, v)
exp = append(exp, pair{t: ts, v: v})
// fmt.Println("appended", len(c.Bytes()), c.Bytes())
}
it := c.Iterator(nil)
var res []pair
for it.Next() {
ts, v := it.At()
res = append(res, pair{t: ts, v: v})
}
if it.Err() != nil {
return it.Err()
}
if !reflect.DeepEqual(exp, res) {
return fmt.Errorf("unexpected result\n\ngot: %v\n\nexp: %v", res, exp)
}
return nil
}
func benchmarkIterator(b *testing.B, newChunk func() Chunk) {
var (
t = int64(1234123324)
v = 1243535.123
)
var exp []pair
for i := 0; i < b.N; i++ {
// t += int64(rand.Intn(10000) + 1)
t += int64(1000)
// v = rand.Float64()
v += float64(100)
exp = append(exp, pair{t: t, v: v})
}
var chunks []Chunk
for i := 0; i < b.N; {
c := newChunk()
a, err := c.Appender()
if err != nil {
b.Fatalf("get appender: %s", err)
}
j := 0
for _, p := range exp {
if j > 250 {
break
}
a.Append(p.t, p.v)
i++
j++
}
chunks = append(chunks, c)
}
b.ReportAllocs()
b.ResetTimer()
fmt.Println("num", b.N, "created chunks", len(chunks))
res := make([]float64, 0, 1024)
var it Iterator
for i := 0; i < len(chunks); i++ {
c := chunks[i]
it := c.Iterator(it)
for it.Next() {
_, v := it.At()
res = append(res, v)
}
if it.Err() != io.EOF {
testutil.Ok(b, it.Err())
}
res = res[:0]
}
}
func BenchmarkXORIterator(b *testing.B) {
benchmarkIterator(b, func() Chunk {
return NewXORChunk()
})
}
func BenchmarkXORAppender(b *testing.B) {
benchmarkAppender(b, func() Chunk {
return NewXORChunk()
})
}
func benchmarkAppender(b *testing.B, newChunk func() Chunk) {
var (
t = int64(1234123324)
v = 1243535.123
)
var exp []pair
for i := 0; i < b.N; i++ {
// t += int64(rand.Intn(10000) + 1)
t += int64(1000)
// v = rand.Float64()
v += float64(100)
exp = append(exp, pair{t: t, v: v})
}
b.ReportAllocs()
b.ResetTimer()
var chunks []Chunk
for i := 0; i < b.N; {
c := newChunk()
a, err := c.Appender()
if err != nil {
b.Fatalf("get appender: %s", err)
}
j := 0
for _, p := range exp {
if j > 250 {
break
}
a.Append(p.t, p.v)
i++
j++
}
chunks = append(chunks, c)
}
fmt.Println("num", b.N, "created chunks", len(chunks))
}

407
tsdb/chunkenc/xor.go Normal file
View file

@ -0,0 +1,407 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The code in this file was largely written by Damian Gryski as part of
// https://github.com/dgryski/go-tsz and published under the license below.
// It was modified to accommodate reading from byte slices without modifying
// the underlying bytes, which would panic when reading from mmaped
// read-only byte slices.
// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package chunkenc
import (
"encoding/binary"
"math"
"math/bits"
)
// XORChunk holds XOR encoded sample data.
type XORChunk struct {
b bstream
}
// NewXORChunk returns a new chunk with XOR encoding of the given size.
func NewXORChunk() *XORChunk {
b := make([]byte, 2, 128)
return &XORChunk{b: bstream{stream: b, count: 0}}
}
// Encoding returns the encoding type.
func (c *XORChunk) Encoding() Encoding {
return EncXOR
}
// Bytes returns the underlying byte slice of the chunk.
func (c *XORChunk) Bytes() []byte {
return c.b.bytes()
}
// NumSamples returns the number of samples in the chunk.
func (c *XORChunk) NumSamples() int {
return int(binary.BigEndian.Uint16(c.Bytes()))
}
// Appender implements the Chunk interface.
func (c *XORChunk) Appender() (Appender, error) {
it := c.iterator(nil)
// To get an appender we must know the state it would have if we had
// appended all existing data from scratch.
// We iterate through the end and populate via the iterator's state.
for it.Next() {
}
if err := it.Err(); err != nil {
return nil, err
}
a := &xorAppender{
b: &c.b,
t: it.t,
v: it.val,
tDelta: it.tDelta,
leading: it.leading,
trailing: it.trailing,
}
if binary.BigEndian.Uint16(a.b.bytes()) == 0 {
a.leading = 0xff
}
return a, nil
}
func (c *XORChunk) iterator(it Iterator) *xorIterator {
// Should iterators guarantee to act on a copy of the data so it doesn't lock append?
// When using striped locks to guard access to chunks, probably yes.
// Could only copy data if the chunk is not completed yet.
if xorIter, ok := it.(*xorIterator); ok {
xorIter.Reset(c.b.bytes())
return xorIter
}
return &xorIterator{
// The first 2 bytes contain chunk headers.
// We skip that for actual samples.
br: newBReader(c.b.bytes()[2:]),
numTotal: binary.BigEndian.Uint16(c.b.bytes()),
}
}
// Iterator implements the Chunk interface.
func (c *XORChunk) Iterator(it Iterator) Iterator {
return c.iterator(it)
}
type xorAppender struct {
b *bstream
t int64
v float64
tDelta uint64
leading uint8
trailing uint8
}
func (a *xorAppender) Append(t int64, v float64) {
var tDelta uint64
num := binary.BigEndian.Uint16(a.b.bytes())
if num == 0 {
buf := make([]byte, binary.MaxVarintLen64)
for _, b := range buf[:binary.PutVarint(buf, t)] {
a.b.writeByte(b)
}
a.b.writeBits(math.Float64bits(v), 64)
} else if num == 1 {
tDelta = uint64(t - a.t)
buf := make([]byte, binary.MaxVarintLen64)
for _, b := range buf[:binary.PutUvarint(buf, tDelta)] {
a.b.writeByte(b)
}
a.writeVDelta(v)
} else {
tDelta = uint64(t - a.t)
dod := int64(tDelta - a.tDelta)
// Gorilla has a max resolution of seconds, Prometheus milliseconds.
// Thus we use higher value range steps with larger bit size.
switch {
case dod == 0:
a.b.writeBit(zero)
case bitRange(dod, 14):
a.b.writeBits(0x02, 2) // '10'
a.b.writeBits(uint64(dod), 14)
case bitRange(dod, 17):
a.b.writeBits(0x06, 3) // '110'
a.b.writeBits(uint64(dod), 17)
case bitRange(dod, 20):
a.b.writeBits(0x0e, 4) // '1110'
a.b.writeBits(uint64(dod), 20)
default:
a.b.writeBits(0x0f, 4) // '1111'
a.b.writeBits(uint64(dod), 64)
}
a.writeVDelta(v)
}
a.t = t
a.v = v
binary.BigEndian.PutUint16(a.b.bytes(), num+1)
a.tDelta = tDelta
}
func bitRange(x int64, nbits uint8) bool {
return -((1<<(nbits-1))-1) <= x && x <= 1<<(nbits-1)
}
func (a *xorAppender) writeVDelta(v float64) {
vDelta := math.Float64bits(v) ^ math.Float64bits(a.v)
if vDelta == 0 {
a.b.writeBit(zero)
return
}
a.b.writeBit(one)
leading := uint8(bits.LeadingZeros64(vDelta))
trailing := uint8(bits.TrailingZeros64(vDelta))
// Clamp number of leading zeros to avoid overflow when encoding.
if leading >= 32 {
leading = 31
}
if a.leading != 0xff && leading >= a.leading && trailing >= a.trailing {
a.b.writeBit(zero)
a.b.writeBits(vDelta>>a.trailing, 64-int(a.leading)-int(a.trailing))
} else {
a.leading, a.trailing = leading, trailing
a.b.writeBit(one)
a.b.writeBits(uint64(leading), 5)
// Note that if leading == trailing == 0, then sigbits == 64. But that value doesn't actually fit into the 6 bits we have.
// Luckily, we never need to encode 0 significant bits, since that would put us in the other case (vdelta == 0).
// So instead we write out a 0 and adjust it back to 64 on unpacking.
sigbits := 64 - leading - trailing
a.b.writeBits(uint64(sigbits), 6)
a.b.writeBits(vDelta>>trailing, int(sigbits))
}
}
type xorIterator struct {
br bstream
numTotal uint16
numRead uint16
t int64
val float64
leading uint8
trailing uint8
tDelta uint64
err error
}
func (it *xorIterator) At() (int64, float64) {
return it.t, it.val
}
func (it *xorIterator) Err() error {
return it.err
}
func (it *xorIterator) Reset(b []byte) {
// The first 2 bytes contain chunk headers.
// We skip that for actual samples.
it.br = newBReader(b[2:])
it.numTotal = binary.BigEndian.Uint16(b)
it.numRead = 0
it.t = 0
it.val = 0
it.leading = 0
it.trailing = 0
it.tDelta = 0
it.err = nil
}
func (it *xorIterator) Next() bool {
if it.err != nil || it.numRead == it.numTotal {
return false
}
if it.numRead == 0 {
t, err := binary.ReadVarint(&it.br)
if err != nil {
it.err = err
return false
}
v, err := it.br.readBits(64)
if err != nil {
it.err = err
return false
}
it.t = t
it.val = math.Float64frombits(v)
it.numRead++
return true
}
if it.numRead == 1 {
tDelta, err := binary.ReadUvarint(&it.br)
if err != nil {
it.err = err
return false
}
it.tDelta = tDelta
it.t = it.t + int64(it.tDelta)
return it.readValue()
}
var d byte
// read delta-of-delta
for i := 0; i < 4; i++ {
d <<= 1
bit, err := it.br.readBit()
if err != nil {
it.err = err
return false
}
if bit == zero {
break
}
d |= 1
}
var sz uint8
var dod int64
switch d {
case 0x00:
// dod == 0
case 0x02:
sz = 14
case 0x06:
sz = 17
case 0x0e:
sz = 20
case 0x0f:
bits, err := it.br.readBits(64)
if err != nil {
it.err = err
return false
}
dod = int64(bits)
}
if sz != 0 {
bits, err := it.br.readBits(int(sz))
if err != nil {
it.err = err
return false
}
if bits > (1 << (sz - 1)) {
// or something
bits = bits - (1 << sz)
}
dod = int64(bits)
}
it.tDelta = uint64(int64(it.tDelta) + dod)
it.t = it.t + int64(it.tDelta)
return it.readValue()
}
func (it *xorIterator) readValue() bool {
bit, err := it.br.readBit()
if err != nil {
it.err = err
return false
}
if bit == zero {
// it.val = it.val
} else {
bit, err := it.br.readBit()
if err != nil {
it.err = err
return false
}
if bit == zero {
// reuse leading/trailing zero bits
// it.leading, it.trailing = it.leading, it.trailing
} else {
bits, err := it.br.readBits(5)
if err != nil {
it.err = err
return false
}
it.leading = uint8(bits)
bits, err = it.br.readBits(6)
if err != nil {
it.err = err
return false
}
mbits := uint8(bits)
// 0 significant bits here means we overflowed and we actually need 64; see comment in encoder
if mbits == 0 {
mbits = 64
}
it.trailing = 64 - it.leading - mbits
}
mbits := int(64 - it.leading - it.trailing)
bits, err := it.br.readBits(mbits)
if err != nil {
it.err = err
return false
}
vbits := math.Float64bits(it.val)
vbits ^= (bits << it.trailing)
it.val = math.Float64frombits(vbits)
}
it.numRead++
return true
}

512
tsdb/chunks/chunks.go Normal file
View file

@ -0,0 +1,512 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package chunks
import (
"bufio"
"encoding/binary"
"fmt"
"hash"
"hash/crc32"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/chunkenc"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/fileutil"
)
const (
// MagicChunks is 4 bytes at the head of a series file.
MagicChunks = 0x85BD40DD
// MagicChunksSize is the size in bytes of MagicChunks.
MagicChunksSize = 4
chunksFormatV1 = 1
ChunksFormatVersionSize = 1
chunkHeaderSize = MagicChunksSize + ChunksFormatVersionSize
)
// Meta holds information about a chunk of data.
type Meta struct {
// Ref and Chunk hold either a reference that can be used to retrieve
// chunk data or the data itself.
// Generally, only one of them is set.
Ref uint64
Chunk chunkenc.Chunk
// Time range the data covers.
// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
MinTime, MaxTime int64
}
// writeHash writes the chunk encoding and raw data into the provided hash.
func (cm *Meta) writeHash(h hash.Hash, buf []byte) error {
buf = append(buf[:0], byte(cm.Chunk.Encoding()))
if _, err := h.Write(buf[:1]); err != nil {
return err
}
if _, err := h.Write(cm.Chunk.Bytes()); err != nil {
return err
}
return nil
}
// OverlapsClosedInterval Returns true if the chunk overlaps [mint, maxt].
func (cm *Meta) OverlapsClosedInterval(mint, maxt int64) bool {
// The chunk itself is a closed interval [cm.MinTime, cm.MaxTime].
return cm.MinTime <= maxt && mint <= cm.MaxTime
}
var (
errInvalidSize = fmt.Errorf("invalid size")
)
var castagnoliTable *crc32.Table
func init() {
castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}
// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
// polynomial may be easily changed in one location at a later time, if necessary.
func newCRC32() hash.Hash32 {
return crc32.New(castagnoliTable)
}
// Writer implements the ChunkWriter interface for the standard
// serialization format.
type Writer struct {
dirFile *os.File
files []*os.File
wbuf *bufio.Writer
n int64
crc32 hash.Hash
buf [binary.MaxVarintLen32]byte
segmentSize int64
}
const (
defaultChunkSegmentSize = 512 * 1024 * 1024
)
// NewWriter returns a new writer against the given directory.
func NewWriter(dir string) (*Writer, error) {
if err := os.MkdirAll(dir, 0777); err != nil {
return nil, err
}
dirFile, err := fileutil.OpenDir(dir)
if err != nil {
return nil, err
}
cw := &Writer{
dirFile: dirFile,
n: 0,
crc32: newCRC32(),
segmentSize: defaultChunkSegmentSize,
}
return cw, nil
}
func (w *Writer) tail() *os.File {
if len(w.files) == 0 {
return nil
}
return w.files[len(w.files)-1]
}
// finalizeTail writes all pending data to the current tail file,
// truncates its size, and closes it.
func (w *Writer) finalizeTail() error {
tf := w.tail()
if tf == nil {
return nil
}
if err := w.wbuf.Flush(); err != nil {
return err
}
if err := tf.Sync(); err != nil {
return err
}
// As the file was pre-allocated, we truncate any superfluous zero bytes.
off, err := tf.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
if err := tf.Truncate(off); err != nil {
return err
}
return tf.Close()
}
func (w *Writer) cut() error {
// Sync current tail to disk and close.
if err := w.finalizeTail(); err != nil {
return err
}
p, _, err := nextSequenceFile(w.dirFile.Name())
if err != nil {
return err
}
f, err := os.OpenFile(p, os.O_WRONLY|os.O_CREATE, 0666)
if err != nil {
return err
}
if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil {
return err
}
if err = w.dirFile.Sync(); err != nil {
return err
}
// Write header metadata for new file.
metab := make([]byte, 8)
binary.BigEndian.PutUint32(metab[:MagicChunksSize], MagicChunks)
metab[4] = chunksFormatV1
if _, err := f.Write(metab); err != nil {
return err
}
w.files = append(w.files, f)
if w.wbuf != nil {
w.wbuf.Reset(f)
} else {
w.wbuf = bufio.NewWriterSize(f, 8*1024*1024)
}
w.n = 8
return nil
}
func (w *Writer) write(b []byte) error {
n, err := w.wbuf.Write(b)
w.n += int64(n)
return err
}
// MergeOverlappingChunks removes the samples whose timestamp is overlapping.
// The last appearing sample is retained in case there is overlapping.
// This assumes that `chks []Meta` is sorted w.r.t. MinTime.
func MergeOverlappingChunks(chks []Meta) ([]Meta, error) {
if len(chks) < 2 {
return chks, nil
}
newChks := make([]Meta, 0, len(chks)) // Will contain the merged chunks.
newChks = append(newChks, chks[0])
last := 0
for _, c := range chks[1:] {
// We need to check only the last chunk in newChks.
// Reason: (1) newChks[last-1].MaxTime < newChks[last].MinTime (non overlapping)
// (2) As chks are sorted w.r.t. MinTime, newChks[last].MinTime < c.MinTime.
// So never overlaps with newChks[last-1] or anything before that.
if c.MinTime > newChks[last].MaxTime {
newChks = append(newChks, c)
last++
continue
}
nc := &newChks[last]
if c.MaxTime > nc.MaxTime {
nc.MaxTime = c.MaxTime
}
chk, err := MergeChunks(nc.Chunk, c.Chunk)
if err != nil {
return nil, err
}
nc.Chunk = chk
}
return newChks, nil
}
// MergeChunks vertically merges a and b, i.e., if there is any sample
// with same timestamp in both a and b, the sample in a is discarded.
func MergeChunks(a, b chunkenc.Chunk) (*chunkenc.XORChunk, error) {
newChunk := chunkenc.NewXORChunk()
app, err := newChunk.Appender()
if err != nil {
return nil, err
}
ait := a.Iterator(nil)
bit := b.Iterator(nil)
aok, bok := ait.Next(), bit.Next()
for aok && bok {
at, av := ait.At()
bt, bv := bit.At()
if at < bt {
app.Append(at, av)
aok = ait.Next()
} else if bt < at {
app.Append(bt, bv)
bok = bit.Next()
} else {
app.Append(bt, bv)
aok = ait.Next()
bok = bit.Next()
}
}
for aok {
at, av := ait.At()
app.Append(at, av)
aok = ait.Next()
}
for bok {
bt, bv := bit.At()
app.Append(bt, bv)
bok = bit.Next()
}
if ait.Err() != nil {
return nil, ait.Err()
}
if bit.Err() != nil {
return nil, bit.Err()
}
return newChunk, nil
}
func (w *Writer) WriteChunks(chks ...Meta) error {
// Calculate maximum space we need and cut a new segment in case
// we don't fit into the current one.
maxLen := int64(binary.MaxVarintLen32) // The number of chunks.
for _, c := range chks {
maxLen += binary.MaxVarintLen32 + 1 // The number of bytes in the chunk and its encoding.
maxLen += int64(len(c.Chunk.Bytes()))
maxLen += 4 // The 4 bytes of crc32
}
newsz := w.n + maxLen
if w.wbuf == nil || newsz > w.segmentSize && maxLen <= w.segmentSize {
if err := w.cut(); err != nil {
return err
}
}
var seq = uint64(w.seq()) << 32
for i := range chks {
chk := &chks[i]
chk.Ref = seq | uint64(w.n)
n := binary.PutUvarint(w.buf[:], uint64(len(chk.Chunk.Bytes())))
if err := w.write(w.buf[:n]); err != nil {
return err
}
w.buf[0] = byte(chk.Chunk.Encoding())
if err := w.write(w.buf[:1]); err != nil {
return err
}
if err := w.write(chk.Chunk.Bytes()); err != nil {
return err
}
w.crc32.Reset()
if err := chk.writeHash(w.crc32, w.buf[:]); err != nil {
return err
}
if err := w.write(w.crc32.Sum(w.buf[:0])); err != nil {
return err
}
}
return nil
}
func (w *Writer) seq() int {
return len(w.files) - 1
}
func (w *Writer) Close() error {
if err := w.finalizeTail(); err != nil {
return err
}
// close dir file (if not windows platform will fail on rename)
return w.dirFile.Close()
}
// ByteSlice abstracts a byte slice.
type ByteSlice interface {
Len() int
Range(start, end int) []byte
}
type realByteSlice []byte
func (b realByteSlice) Len() int {
return len(b)
}
func (b realByteSlice) Range(start, end int) []byte {
return b[start:end]
}
func (b realByteSlice) Sub(start, end int) ByteSlice {
return b[start:end]
}
// Reader implements a ChunkReader for a serialized byte stream
// of series data.
type Reader struct {
bs []ByteSlice // The underlying bytes holding the encoded series data.
cs []io.Closer // Closers for resources behind the byte slices.
size int64 // The total size of bytes in the reader.
pool chunkenc.Pool
}
func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, error) {
cr := Reader{pool: pool, bs: bs, cs: cs}
var totalSize int64
for i, b := range cr.bs {
if b.Len() < chunkHeaderSize {
return nil, errors.Wrapf(errInvalidSize, "invalid chunk header in segment %d", i)
}
// Verify magic number.
if m := binary.BigEndian.Uint32(b.Range(0, MagicChunksSize)); m != MagicChunks {
return nil, errors.Errorf("invalid magic number %x", m)
}
// Verify chunk format version.
if v := int(b.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)[0]); v != chunksFormatV1 {
return nil, errors.Errorf("invalid chunk format version %d", v)
}
totalSize += int64(b.Len())
}
cr.size = totalSize
return &cr, nil
}
// NewDirReader returns a new Reader against sequentially numbered files in the
// given directory.
func NewDirReader(dir string, pool chunkenc.Pool) (*Reader, error) {
files, err := sequenceFiles(dir)
if err != nil {
return nil, err
}
if pool == nil {
pool = chunkenc.NewPool()
}
var (
bs []ByteSlice
cs []io.Closer
merr tsdb_errors.MultiError
)
for _, fn := range files {
f, err := fileutil.OpenMmapFile(fn)
if err != nil {
merr.Add(errors.Wrap(err, "mmap files"))
merr.Add(closeAll(cs))
return nil, merr
}
cs = append(cs, f)
bs = append(bs, realByteSlice(f.Bytes()))
}
reader, err := newReader(bs, cs, pool)
if err != nil {
merr.Add(err)
merr.Add(closeAll(cs))
return nil, merr
}
return reader, nil
}
func (s *Reader) Close() error {
return closeAll(s.cs)
}
// Size returns the size of the chunks.
func (s *Reader) Size() int64 {
return s.size
}
// Chunk returns a chunk from a given reference.
func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) {
var (
sgmSeq = int(ref >> 32)
sgmOffset = int((ref << 32) >> 32)
)
if sgmSeq >= len(s.bs) {
return nil, errors.Errorf("reference sequence %d out of range", sgmSeq)
}
chkS := s.bs[sgmSeq]
if sgmOffset >= chkS.Len() {
return nil, errors.Errorf("offset %d beyond data size %d", sgmOffset, chkS.Len())
}
// With the minimum chunk length this should never cause us reading
// over the end of the slice.
chk := chkS.Range(sgmOffset, sgmOffset+binary.MaxVarintLen32)
chkLen, n := binary.Uvarint(chk)
if n <= 0 {
return nil, errors.Errorf("reading chunk length failed with %d", n)
}
chk = chkS.Range(sgmOffset+n, sgmOffset+n+1+int(chkLen))
return s.pool.Get(chunkenc.Encoding(chk[0]), chk[1:1+chkLen])
}
func nextSequenceFile(dir string) (string, int, error) {
names, err := fileutil.ReadDir(dir)
if err != nil {
return "", 0, err
}
i := uint64(0)
for _, n := range names {
j, err := strconv.ParseUint(n, 10, 64)
if err != nil {
continue
}
i = j
}
return filepath.Join(dir, fmt.Sprintf("%0.6d", i+1)), int(i + 1), nil
}
func sequenceFiles(dir string) ([]string, error) {
files, err := ioutil.ReadDir(dir)
if err != nil {
return nil, err
}
var res []string
for _, fi := range files {
if _, err := strconv.ParseUint(fi.Name(), 10, 64); err != nil {
continue
}
res = append(res, filepath.Join(dir, fi.Name()))
}
return res, nil
}
func closeAll(cs []io.Closer) error {
var merr tsdb_errors.MultiError
for _, c := range cs {
merr.Add(c.Close())
}
return merr.Err()
}

View file

@ -0,0 +1,28 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package chunks
import (
"testing"
"github.com/prometheus/tsdb/testutil"
)
func TestReaderWithInvalidBuffer(t *testing.T) {
b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
r := &Reader{bs: []ByteSlice{b}}
_, err := r.Chunk(0)
testutil.NotOk(t, err)
}

3
tsdb/cmd/tsdb/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
testdata*
tsdb
benchout

3
tsdb/cmd/tsdb/README.md Normal file
View file

@ -0,0 +1,3 @@
TODO:
- [ ] add tabular output
- [ ] break commands in separate files

653
tsdb/cmd/tsdb/main.go Normal file
View file

@ -0,0 +1,653 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"math"
"os"
"path/filepath"
"runtime"
"runtime/pprof"
"sort"
"strconv"
"strings"
"sync"
"text/tabwriter"
"time"
"github.com/go-kit/kit/log"
"github.com/pkg/errors"
"github.com/prometheus/tsdb"
"github.com/prometheus/tsdb/chunks"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/labels"
"gopkg.in/alecthomas/kingpin.v2"
)
func main() {
if err := execute(); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func execute() (err error) {
var (
defaultDBPath = filepath.Join("benchout", "storage")
cli = kingpin.New(filepath.Base(os.Args[0]), "CLI tool for tsdb")
benchCmd = cli.Command("bench", "run benchmarks")
benchWriteCmd = benchCmd.Command("write", "run a write performance benchmark")
benchWriteOutPath = benchWriteCmd.Flag("out", "set the output path").Default("benchout").String()
benchWriteNumMetrics = benchWriteCmd.Flag("metrics", "number of metrics to read").Default("10000").Int()
benchSamplesFile = benchWriteCmd.Arg("file", "input file with samples data, default is ("+filepath.Join("..", "..", "testdata", "20kseries.json")+")").Default(filepath.Join("..", "..", "testdata", "20kseries.json")).String()
listCmd = cli.Command("ls", "list db blocks")
listCmdHumanReadable = listCmd.Flag("human-readable", "print human readable values").Short('h').Bool()
listPath = listCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
analyzeCmd = cli.Command("analyze", "analyze churn, label pair cardinality.")
analyzePath = analyzeCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
analyzeBlockID = analyzeCmd.Arg("block id", "block to analyze (default is the last block)").String()
analyzeLimit = analyzeCmd.Flag("limit", "how many items to show in each list").Default("20").Int()
dumpCmd = cli.Command("dump", "dump samples from a TSDB")
dumpPath = dumpCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
dumpMinTime = dumpCmd.Flag("min-time", "minimum timestamp to dump").Default(strconv.FormatInt(math.MinInt64, 10)).Int64()
dumpMaxTime = dumpCmd.Flag("max-time", "maximum timestamp to dump").Default(strconv.FormatInt(math.MaxInt64, 10)).Int64()
)
logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr))
var merr tsdb_errors.MultiError
switch kingpin.MustParse(cli.Parse(os.Args[1:])) {
case benchWriteCmd.FullCommand():
wb := &writeBenchmark{
outPath: *benchWriteOutPath,
numMetrics: *benchWriteNumMetrics,
samplesFile: *benchSamplesFile,
logger: logger,
}
return wb.run()
case listCmd.FullCommand():
db, err := tsdb.OpenDBReadOnly(*listPath, nil)
if err != nil {
return err
}
defer func() {
merr.Add(err)
merr.Add(db.Close())
err = merr.Err()
}()
blocks, err := db.Blocks()
if err != nil {
return err
}
printBlocks(blocks, listCmdHumanReadable)
case analyzeCmd.FullCommand():
db, err := tsdb.OpenDBReadOnly(*analyzePath, nil)
if err != nil {
return err
}
defer func() {
merr.Add(err)
merr.Add(db.Close())
err = merr.Err()
}()
blocks, err := db.Blocks()
if err != nil {
return err
}
var block tsdb.BlockReader
if *analyzeBlockID != "" {
for _, b := range blocks {
if b.Meta().ULID.String() == *analyzeBlockID {
block = b
break
}
}
} else if len(blocks) > 0 {
block = blocks[len(blocks)-1]
}
if block == nil {
return fmt.Errorf("block not found")
}
return analyzeBlock(block, *analyzeLimit)
case dumpCmd.FullCommand():
db, err := tsdb.OpenDBReadOnly(*dumpPath, nil)
if err != nil {
return err
}
defer func() {
merr.Add(err)
merr.Add(db.Close())
err = merr.Err()
}()
return dumpSamples(db, *dumpMinTime, *dumpMaxTime)
}
return nil
}
type writeBenchmark struct {
outPath string
samplesFile string
cleanup bool
numMetrics int
storage *tsdb.DB
cpuprof *os.File
memprof *os.File
blockprof *os.File
mtxprof *os.File
logger log.Logger
}
func (b *writeBenchmark) run() error {
if b.outPath == "" {
dir, err := ioutil.TempDir("", "tsdb_bench")
if err != nil {
return err
}
b.outPath = dir
b.cleanup = true
}
if err := os.RemoveAll(b.outPath); err != nil {
return err
}
if err := os.MkdirAll(b.outPath, 0777); err != nil {
return err
}
dir := filepath.Join(b.outPath, "storage")
l := log.With(b.logger, "ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller)
st, err := tsdb.Open(dir, l, nil, &tsdb.Options{
RetentionDuration: 15 * 24 * 60 * 60 * 1000, // 15 days in milliseconds
BlockRanges: tsdb.ExponentialBlockRanges(2*60*60*1000, 5, 3),
})
if err != nil {
return err
}
b.storage = st
var labels []labels.Labels
_, err = measureTime("readData", func() error {
f, err := os.Open(b.samplesFile)
if err != nil {
return err
}
defer f.Close()
labels, err = readPrometheusLabels(f, b.numMetrics)
if err != nil {
return err
}
return nil
})
if err != nil {
return err
}
var total uint64
dur, err := measureTime("ingestScrapes", func() error {
b.startProfiling()
total, err = b.ingestScrapes(labels, 3000)
if err != nil {
return err
}
return nil
})
if err != nil {
return err
}
fmt.Println(" > total samples:", total)
fmt.Println(" > samples/sec:", float64(total)/dur.Seconds())
_, err = measureTime("stopStorage", func() error {
if err := b.storage.Close(); err != nil {
return err
}
if err := b.stopProfiling(); err != nil {
return err
}
return nil
})
if err != nil {
return err
}
return nil
}
const timeDelta = 30000
func (b *writeBenchmark) ingestScrapes(lbls []labels.Labels, scrapeCount int) (uint64, error) {
var mu sync.Mutex
var total uint64
for i := 0; i < scrapeCount; i += 100 {
var wg sync.WaitGroup
lbls := lbls
for len(lbls) > 0 {
l := 1000
if len(lbls) < 1000 {
l = len(lbls)
}
batch := lbls[:l]
lbls = lbls[l:]
wg.Add(1)
go func() {
n, err := b.ingestScrapesShard(batch, 100, int64(timeDelta*i))
if err != nil {
// exitWithError(err)
fmt.Println(" err", err)
}
mu.Lock()
total += n
mu.Unlock()
wg.Done()
}()
}
wg.Wait()
}
fmt.Println("ingestion completed")
return total, nil
}
func (b *writeBenchmark) ingestScrapesShard(lbls []labels.Labels, scrapeCount int, baset int64) (uint64, error) {
ts := baset
type sample struct {
labels labels.Labels
value int64
ref *uint64
}
scrape := make([]*sample, 0, len(lbls))
for _, m := range lbls {
scrape = append(scrape, &sample{
labels: m,
value: 123456789,
})
}
total := uint64(0)
for i := 0; i < scrapeCount; i++ {
app := b.storage.Appender()
ts += timeDelta
for _, s := range scrape {
s.value += 1000
if s.ref == nil {
ref, err := app.Add(s.labels, ts, float64(s.value))
if err != nil {
panic(err)
}
s.ref = &ref
} else if err := app.AddFast(*s.ref, ts, float64(s.value)); err != nil {
if errors.Cause(err) != tsdb.ErrNotFound {
panic(err)
}
ref, err := app.Add(s.labels, ts, float64(s.value))
if err != nil {
panic(err)
}
s.ref = &ref
}
total++
}
if err := app.Commit(); err != nil {
return total, err
}
}
return total, nil
}
func (b *writeBenchmark) startProfiling() error {
var err error
// Start CPU profiling.
b.cpuprof, err = os.Create(filepath.Join(b.outPath, "cpu.prof"))
if err != nil {
return fmt.Errorf("bench: could not create cpu profile: %v", err)
}
if err := pprof.StartCPUProfile(b.cpuprof); err != nil {
return fmt.Errorf("bench: could not start CPU profile: %v", err)
}
// Start memory profiling.
b.memprof, err = os.Create(filepath.Join(b.outPath, "mem.prof"))
if err != nil {
return fmt.Errorf("bench: could not create memory profile: %v", err)
}
runtime.MemProfileRate = 64 * 1024
// Start fatal profiling.
b.blockprof, err = os.Create(filepath.Join(b.outPath, "block.prof"))
if err != nil {
return fmt.Errorf("bench: could not create block profile: %v", err)
}
runtime.SetBlockProfileRate(20)
b.mtxprof, err = os.Create(filepath.Join(b.outPath, "mutex.prof"))
if err != nil {
return fmt.Errorf("bench: could not create mutex profile: %v", err)
}
runtime.SetMutexProfileFraction(20)
return nil
}
func (b *writeBenchmark) stopProfiling() error {
if b.cpuprof != nil {
pprof.StopCPUProfile()
b.cpuprof.Close()
b.cpuprof = nil
}
if b.memprof != nil {
if err := pprof.Lookup("heap").WriteTo(b.memprof, 0); err != nil {
return fmt.Errorf("error writing mem profile: %v", err)
}
b.memprof.Close()
b.memprof = nil
}
if b.blockprof != nil {
if err := pprof.Lookup("block").WriteTo(b.blockprof, 0); err != nil {
return fmt.Errorf("error writing block profile: %v", err)
}
b.blockprof.Close()
b.blockprof = nil
runtime.SetBlockProfileRate(0)
}
if b.mtxprof != nil {
if err := pprof.Lookup("mutex").WriteTo(b.mtxprof, 0); err != nil {
return fmt.Errorf("error writing mutex profile: %v", err)
}
b.mtxprof.Close()
b.mtxprof = nil
runtime.SetMutexProfileFraction(0)
}
return nil
}
func measureTime(stage string, f func() error) (time.Duration, error) {
fmt.Printf(">> start stage=%s\n", stage)
start := time.Now()
err := f()
if err != nil {
return 0, err
}
fmt.Printf(">> completed stage=%s duration=%s\n", stage, time.Since(start))
return time.Since(start), nil
}
func readPrometheusLabels(r io.Reader, n int) ([]labels.Labels, error) {
scanner := bufio.NewScanner(r)
var mets []labels.Labels
hashes := map[uint64]struct{}{}
i := 0
for scanner.Scan() && i < n {
m := make(labels.Labels, 0, 10)
r := strings.NewReplacer("\"", "", "{", "", "}", "")
s := r.Replace(scanner.Text())
labelChunks := strings.Split(s, ",")
for _, labelChunk := range labelChunks {
split := strings.Split(labelChunk, ":")
m = append(m, labels.Label{Name: split[0], Value: split[1]})
}
// Order of the k/v labels matters, don't assume we'll always receive them already sorted.
sort.Sort(m)
h := m.Hash()
if _, ok := hashes[h]; ok {
continue
}
mets = append(mets, m)
hashes[h] = struct{}{}
i++
}
return mets, nil
}
func printBlocks(blocks []tsdb.BlockReader, humanReadable *bool) {
tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
defer tw.Flush()
fmt.Fprintln(tw, "BLOCK ULID\tMIN TIME\tMAX TIME\tNUM SAMPLES\tNUM CHUNKS\tNUM SERIES")
for _, b := range blocks {
meta := b.Meta()
fmt.Fprintf(tw,
"%v\t%v\t%v\t%v\t%v\t%v\n",
meta.ULID,
getFormatedTime(meta.MinTime, humanReadable),
getFormatedTime(meta.MaxTime, humanReadable),
meta.Stats.NumSamples,
meta.Stats.NumChunks,
meta.Stats.NumSeries,
)
}
}
func getFormatedTime(timestamp int64, humanReadable *bool) string {
if *humanReadable {
return time.Unix(timestamp/1000, 0).String()
}
return strconv.FormatInt(timestamp, 10)
}
func analyzeBlock(b tsdb.BlockReader, limit int) error {
meta := b.Meta()
fmt.Printf("Block ID: %s\n", meta.ULID)
// Presume 1ms resolution that Prometheus uses.
fmt.Printf("Duration: %s\n", (time.Duration(meta.MaxTime-meta.MinTime) * 1e6).String())
fmt.Printf("Series: %d\n", meta.Stats.NumSeries)
ir, err := b.Index()
if err != nil {
return err
}
defer ir.Close()
allLabelNames, err := ir.LabelNames()
if err != nil {
return err
}
fmt.Printf("Label names: %d\n", len(allLabelNames))
type postingInfo struct {
key string
metric uint64
}
postingInfos := []postingInfo{}
printInfo := func(postingInfos []postingInfo) {
sort.Slice(postingInfos, func(i, j int) bool { return postingInfos[i].metric > postingInfos[j].metric })
for i, pc := range postingInfos {
fmt.Printf("%d %s\n", pc.metric, pc.key)
if i >= limit {
break
}
}
}
labelsUncovered := map[string]uint64{}
labelpairsUncovered := map[string]uint64{}
labelpairsCount := map[string]uint64{}
entries := 0
p, err := ir.Postings("", "") // The special all key.
if err != nil {
return err
}
lbls := labels.Labels{}
chks := []chunks.Meta{}
for p.Next() {
if err = ir.Series(p.At(), &lbls, &chks); err != nil {
return err
}
// Amount of the block time range not covered by this series.
uncovered := uint64(meta.MaxTime-meta.MinTime) - uint64(chks[len(chks)-1].MaxTime-chks[0].MinTime)
for _, lbl := range lbls {
key := lbl.Name + "=" + lbl.Value
labelsUncovered[lbl.Name] += uncovered
labelpairsUncovered[key] += uncovered
labelpairsCount[key]++
entries++
}
}
if p.Err() != nil {
return p.Err()
}
fmt.Printf("Postings (unique label pairs): %d\n", len(labelpairsUncovered))
fmt.Printf("Postings entries (total label pairs): %d\n", entries)
postingInfos = postingInfos[:0]
for k, m := range labelpairsUncovered {
postingInfos = append(postingInfos, postingInfo{k, uint64(float64(m) / float64(meta.MaxTime-meta.MinTime))})
}
fmt.Printf("\nLabel pairs most involved in churning:\n")
printInfo(postingInfos)
postingInfos = postingInfos[:0]
for k, m := range labelsUncovered {
postingInfos = append(postingInfos, postingInfo{k, uint64(float64(m) / float64(meta.MaxTime-meta.MinTime))})
}
fmt.Printf("\nLabel names most involved in churning:\n")
printInfo(postingInfos)
postingInfos = postingInfos[:0]
for k, m := range labelpairsCount {
postingInfos = append(postingInfos, postingInfo{k, m})
}
fmt.Printf("\nMost common label pairs:\n")
printInfo(postingInfos)
postingInfos = postingInfos[:0]
for _, n := range allLabelNames {
values, err := ir.LabelValues(n)
if err != nil {
return err
}
var cumulativeLength uint64
for i := 0; i < values.Len(); i++ {
value, _ := values.At(i)
if err != nil {
return err
}
for _, str := range value {
cumulativeLength += uint64(len(str))
}
}
postingInfos = append(postingInfos, postingInfo{n, cumulativeLength})
}
fmt.Printf("\nLabel names with highest cumulative label value length:\n")
printInfo(postingInfos)
postingInfos = postingInfos[:0]
for _, n := range allLabelNames {
lv, err := ir.LabelValues(n)
if err != nil {
return err
}
postingInfos = append(postingInfos, postingInfo{n, uint64(lv.Len())})
}
fmt.Printf("\nHighest cardinality labels:\n")
printInfo(postingInfos)
postingInfos = postingInfos[:0]
lv, err := ir.LabelValues("__name__")
if err != nil {
return err
}
for i := 0; i < lv.Len(); i++ {
names, err := lv.At(i)
if err != nil {
return err
}
for _, n := range names {
postings, err := ir.Postings("__name__", n)
if err != nil {
return err
}
count := 0
for postings.Next() {
count++
}
if postings.Err() != nil {
return postings.Err()
}
postingInfos = append(postingInfos, postingInfo{n, uint64(count)})
}
}
fmt.Printf("\nHighest cardinality metric names:\n")
printInfo(postingInfos)
return nil
}
func dumpSamples(db *tsdb.DBReadOnly, mint, maxt int64) (err error) {
q, err := db.Querier(mint, maxt)
if err != nil {
return err
}
defer func() {
var merr tsdb_errors.MultiError
merr.Add(err)
merr.Add(q.Close())
err = merr.Err()
}()
ss, err := q.Select(labels.NewMustRegexpMatcher("", ".*"))
if err != nil {
return err
}
for ss.Next() {
series := ss.At()
labels := series.Labels()
it := series.Iterator()
for it.Next() {
ts, val := it.At()
fmt.Printf("%s %g %d\n", labels, val, ts)
}
if it.Err() != nil {
return ss.Err()
}
}
if ss.Err() != nil {
return ss.Err()
}
return nil
}

1034
tsdb/compact.go Normal file

File diff suppressed because it is too large Load diff

1059
tsdb/compact_test.go Normal file

File diff suppressed because it is too large Load diff

1335
tsdb/db.go Normal file

File diff suppressed because it is too large Load diff

2361
tsdb/db_test.go Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,6 @@
## TSDB format
* [Index](index.md)
* [Chunks](chunks.md)
* [Tombstones](tombstones.md)
* [Wal](wal.md)

View file

@ -0,0 +1,31 @@
# Chunks Disk Format
The following describes the format of a chunks file,
which is created in the `chunks/` directory of a block.
The maximum size per segment file is 512MiB.
Chunks in the files are referenced from the index by uint64 composed of
in-file offset (lower 4 bytes) and segment sequence number (upper 4 bytes).
```
┌────────────────────────────┬─────────────────────┐
│ magic(0x85BD40DD) <4 byte> │ version(1) <1 byte>
├────────────────────────────┴─────────────────────┤
│ ┌──────────────────────────────────────────────┐ │
│ │ Chunk 1 │ │
│ ├──────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Chunk N │ │
│ └──────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────┘
```
# Chunk
```
┌───────────────┬───────────────────┬──────────────┬────────────────┐
│ len <uvarint> │ encoding <1 byte> │ data <bytes> │ CRC32 <4 byte>
└───────────────┴───────────────────┴──────────────┴────────────────┘
```

251
tsdb/docs/format/index.md Normal file
View file

@ -0,0 +1,251 @@
# Index Disk Format
The following describes the format of the `index` file found in each block directory.
It is terminated by a table of contents which serves as an entry point into the index.
```
┌────────────────────────────┬─────────────────────┐
│ magic(0xBAAAD700) <4b> │ version(1) <1 byte>
├────────────────────────────┴─────────────────────┤
│ ┌──────────────────────────────────────────────┐ │
│ │ Symbol Table │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Series │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Label Index 1 │ │
│ ├──────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Label Index N │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Postings 1 │ │
│ ├──────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Postings N │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Label Index Table │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Postings Table │ │
│ ├──────────────────────────────────────────────┤ │
│ │ TOC │ │
│ └──────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────┘
```
When the index is written, an arbitrary number of padding bytes may be added between the lined out main sections above. When sequentially scanning through the file, any zero bytes after a section's specified length must be skipped.
Most of the sections described below start with a `len` field. It always specifies the number of bytes just before the trailing CRC32 checksum. The checksum is always calculated over those `len` bytes.
### Symbol Table
The symbol table holds a sorted list of deduplicated strings that occurred in label pairs of the stored series. They can be referenced from subsequent sections and significantly reduce the total index size.
The section contains a sequence of the string entries, each prefixed with the string's length in raw bytes. All strings are utf-8 encoded.
Strings are referenced by sequential indexing. The strings are sorted in lexicographically ascending order.
```
┌────────────────────┬─────────────────────┐
│ len <4b>#symbols <4b>
├────────────────────┴─────────────────────┤
│ ┌──────────────────────┬───────────────┐ │
│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
│ ├──────────────────────┴───────────────┤ │
│ │ . . . │ │
│ ├──────────────────────┬───────────────┤ │
│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
│ └──────────────────────┴───────────────┘ │
├──────────────────────────────────────────┤
│ CRC32 <4b>
└──────────────────────────────────────────┘
```
### Series
The section contains a sequence of series that hold the label set of the series as well as its chunks within the block. The series are sorted lexicographically by their label sets.
Each series section is aligned to 16 bytes. The ID for a series is the `offset/16`. This serves as the series' ID in all subsequent references. Thereby, a sorted list of series IDs implies a lexicographically sorted list of series label sets.
```
┌───────────────────────────────────────┐
│ ┌───────────────────────────────────┐ │
│ │ series_1 │ │
│ ├───────────────────────────────────┤ │
│ │ . . . │ │
│ ├───────────────────────────────────┤ │
│ │ series_n │ │
│ └───────────────────────────────────┘ │
└───────────────────────────────────────┘
```
Every series entry first holds its number of labels, followed by tuples of symbol table references that contain the label name and value. The label pairs are lexicographically sorted.
After the labels, the number of indexed chunks is encoded, followed by a sequence of metadata entries containing the chunks minimum (`mint`) and maximum (`maxt`) timestamp and a reference to its position in the chunk file. The `mint` is the time of the first sample and `maxt` is the time of the last sample in the chunk. Holding the time range data in the index allows dropping chunks irrelevant to queried time ranges without accessing them directly.
`mint` of the first chunk is stored, it's `maxt` is stored as a delta and the `mint` and `maxt` are encoded as deltas to the previous time for subsequent chunks. Similarly, the reference of the first chunk is stored and the next ref is stored as a delta to the previous one.
```
┌──────────────────────────────────────────────────────────────────────────┐
│ len <uvarint>
├──────────────────────────────────────────────────────────────────────────┤
│ ┌──────────────────────────────────────────────────────────────────────┐ │
│ │ labels count <uvarint64> │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │ ┌────────────────────────────────────────────┐ │ │
│ │ │ ref(l_i.name) <uvarint32> │ │ │
│ │ ├────────────────────────────────────────────┤ │ │
│ │ │ ref(l_i.value) <uvarint32> │ │ │
│ │ └────────────────────────────────────────────┘ │ │
│ │ ... │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │ chunks count <uvarint64> │ │
│ ├──────────────────────────────────────────────────────────────────────┤ │
│ │ ┌────────────────────────────────────────────┐ │ │
│ │ │ c_0.mint <varint64> │ │ │
│ │ ├────────────────────────────────────────────┤ │ │
│ │ │ c_0.maxt - c_0.mint <uvarint64> │ │ │
│ │ ├────────────────────────────────────────────┤ │ │
│ │ │ ref(c_0.data) <uvarint64> │ │ │
│ │ └────────────────────────────────────────────┘ │ │
│ │ ┌────────────────────────────────────────────┐ │ │
│ │ │ c_i.mint - c_i-1.maxt <uvarint64> │ │ │
│ │ ├────────────────────────────────────────────┤ │ │
│ │ │ c_i.maxt - c_i.mint <uvarint64> │ │ │
│ │ ├────────────────────────────────────────────┤ │ │
│ │ │ ref(c_i.data) - ref(c_i-1.data) <varint64> │ │ │
│ │ └────────────────────────────────────────────┘ │ │
│ │ ... │ │
│ └──────────────────────────────────────────────────────────────────────┘ │
├──────────────────────────────────────────────────────────────────────────┤
│ CRC32 <4b>
└──────────────────────────────────────────────────────────────────────────┘
```
### Label Index
A label index section indexes the existing (combined) values for one or more label names.
The `#names` field determines the number of indexed label names, followed by the total number of entries in the `#entries` field. The body holds #entries / #names tuples of symbol table references, each tuple being of #names length. The value tuples are sorted in lexicographically increasing order.
```
┌───────────────┬────────────────┬────────────────┐
│ len <4b>#names <4b>#entries <4b>
├───────────────┴────────────────┴────────────────┤
│ ┌─────────────────────────────────────────────┐ │
│ │ ref(value_0) <4b> │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ref(value_n) <4b> │ │
│ └─────────────────────────────────────────────┘ │
│ . . . │
├─────────────────────────────────────────────────┤
│ CRC32 <4b>
└─────────────────────────────────────────────────┘
```
For instance, a single label name with 4 different values will be encoded as:
```
┌────┬───┬───┬──────────────┬──────────────┬──────────────┬──────────────┬───────┐
│ 24 │ 1 │ 4 │ ref(value_0) | ref(value_1) | ref(value_2) | ref(value_3) | CRC32 |
└────┴───┴───┴──────────────┴──────────────┴──────────────┴──────────────┴───────┘
```
The sequence of label index sections is finalized by a [label offset table](#label-offset-table) containing label offset entries that points to the beginning of each label index section for a given label name.
### Postings
Postings sections store monotonically increasing lists of series references that contain a given label pair associated with the list.
```
┌────────────────────┬────────────────────┐
│ len <4b>#entries <4b>
├────────────────────┴────────────────────┤
│ ┌─────────────────────────────────────┐ │
│ │ ref(series_1) <4b> │ │
│ ├─────────────────────────────────────┤ │
│ │ ... │ │
│ ├─────────────────────────────────────┤ │
│ │ ref(series_n) <4b> │ │
│ └─────────────────────────────────────┘ │
├─────────────────────────────────────────┤
│ CRC32 <4b>
└─────────────────────────────────────────┘
```
The sequence of postings sections is finalized by a [postings offset table](#postings-offset-table) containing postings offset entries that points to the beginning of each postings section for a given label pair.
### Label Offset Table
A label offset table stores a sequence of label offset entries.
Every label offset entry holds the label name and the offset to its values in the label index section.
They are used to track label index sections. They are read into memory when an index file is loaded.
```
┌─────────────────────┬──────────────────────┐
│ len <4b>#entries <4b>
├─────────────────────┴──────────────────────┤
│ ┌────────────────────────────────────────┐ │
│ │ n = 1 <1b> │ │
│ ├──────────────────────┬─────────────────┤ │
│ │ len(name) <uvarint> │ name <bytes> │ │
│ ├──────────────────────┴─────────────────┤ │
│ │ offset <uvarint64> │ │
│ └────────────────────────────────────────┘ │
│ . . . │
├────────────────────────────────────────────┤
│ CRC32 <4b>
└────────────────────────────────────────────┘
```
### Postings Offset Table
A postings offset table stores a sequence of postings offset entries.
Every postings offset entry holds the lable name/value pair and the offset to its series list in the postings section.
They are used to track postings sections. They are read into memory when an index file is loaded.
```
┌─────────────────────┬──────────────────────┐
│ len <4b>#entries <4b>
├─────────────────────┴──────────────────────┤
│ ┌────────────────────────────────────────┐ │
│ │ n = 2 <1b> │ │
│ ├──────────────────────┬─────────────────┤ │
│ │ len(name) <uvarint> │ name <bytes> │ │
│ ├──────────────────────┼─────────────────┤ │
│ │ len(value) <uvarint> │ value <bytes> │ │
│ ├──────────────────────┴─────────────────┤ │
│ │ offset <uvarint64> │ │
│ └────────────────────────────────────────┘ │
│ . . . │
├────────────────────────────────────────────┤
│ CRC32 <4b>
└────────────────────────────────────────────┘
```
### TOC
The table of contents serves as an entry point to the entire index and points to various sections in the file.
If a reference is zero, it indicates the respective section does not exist and empty results should be returned upon lookup.
```
┌─────────────────────────────────────────┐
│ ref(symbols) <8b>
├─────────────────────────────────────────┤
│ ref(series) <8b>
├─────────────────────────────────────────┤
│ ref(label indices start) <8b>
├─────────────────────────────────────────┤
│ ref(label offset table) <8b>
├─────────────────────────────────────────┤
│ ref(postings start) <8b>
├─────────────────────────────────────────┤
│ ref(postings offset table) <8b>
├─────────────────────────────────────────┤
│ CRC32 <4b>
└─────────────────────────────────────────┘
```

View file

@ -0,0 +1,31 @@
# Tombstones Disk Format
The following describes the format of a tombstones file, which is placed
at the top level directory of a block.
The last 8 bytes specifies the offset to the start of Stones section.
The stones section is 0 padded to a multiple of 4 for fast scans.
```
┌────────────────────────────┬─────────────────────┐
│ magic(0x0130BA30) <4b> │ version(1) <1 byte>
├────────────────────────────┴─────────────────────┤
│ ┌──────────────────────────────────────────────┐ │
│ │ Tombstone 1 │ │
│ ├──────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├──────────────────────────────────────────────┤ │
│ │ Tombstone N │ │
│ ├──────────────────────────────────────────────┤ │
│ │ CRC<4b> │ │
│ └──────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────┘
```
# Tombstone
```
┌────────────────┬─────────────────┬────────────────┐
│ref <uvarint64> │ mint <varint64> │ maxt <varint64>
└────────────────┴─────────────────┴────────────────┘
```

88
tsdb/docs/format/wal.md Normal file
View file

@ -0,0 +1,88 @@
# WAL Disk Format
The write ahead log operates in segments that are numbered and sequential,
e.g. `000000`, `000001`, `000002`, etc., and are limited to 128MB by default.
A segment is written to in pages of 32KB. Only the last page of the most recent segment
may be partial. A WAL record is an opaque byte slice that gets split up into sub-records
should it exceed the remaining space of the current page. Records are never split across
segment boundaries. If a single record exceeds the default segment size, a segment with
a larger size will be created.
The encoding of pages is largely borrowed from [LevelDB's/RocksDB's write ahead log.](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format)
Notable deviations are that the record fragment is encoded as:
```
┌───────────┬──────────┬────────────┬──────────────┐
│ type <1b> │ len <2b> │ CRC32 <4b> │ data <bytes>
└───────────┴──────────┴────────────┴──────────────┘
```
The type flag has the following states:
* `0`: rest of page will be empty
* `1`: a full record encoded in a single fragment
* `2`: first fragment of a record
* `3`: middle fragment of a record
* `4`: final fragment of a record
## Record encoding
The records written to the write ahead log are encoded as follows:
### Series records
Series records encode the labels that identifies a series and its unique ID.
```
┌────────────────────────────────────────────┐
│ type = 1 <1b>
├────────────────────────────────────────────┤
│ ┌─────────┬──────────────────────────────┐ │
│ │ id <8b> │ n = len(labels) <uvarint> │ │
│ ├─────────┴────────────┬─────────────────┤ │
│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
│ ├──────────────────────┴─────────────────┤ │
│ │ ... │ │
│ ├───────────────────────┬────────────────┤ │
│ │ len(str_2n) <uvarint> │ str_2n <bytes> │ │
│ └───────────────────────┴────────────────┘ │
│ . . . │
└────────────────────────────────────────────┘
```
### Sample records
Sample records encode samples as a list of triples `(series_id, timestamp, value)`.
Series reference and timestamp are encoded as deltas w.r.t the first sample.
The first row stores the starting id and the starting timestamp.
The first sample record begins at the second row.
```
┌──────────────────────────────────────────────────────────────────┐
│ type = 2 <1b>
├──────────────────────────────────────────────────────────────────┤
│ ┌────────────────────┬───────────────────────────┐ │
│ │ id <8b> │ timestamp <8b> │ │
│ └────────────────────┴───────────────────────────┘ │
│ ┌────────────────────┬───────────────────────────┬─────────────┐ │
│ │ id_delta <uvarint> │ timestamp_delta <uvarint> │ value <8b> │ │
│ └────────────────────┴───────────────────────────┴─────────────┘ │
│ . . . │
└──────────────────────────────────────────────────────────────────┘
```
### Tombstone records
Tombstone records encode tombstones as a list of triples `(series_id, min_time, max_time)`
and specify an interval for which samples of a series got deleted.
```
┌─────────────────────────────────────────────────────┐
│ type = 3 <1b>
├─────────────────────────────────────────────────────┤
│ ┌─────────┬───────────────────┬───────────────────┐ │
│ │ id <8b> │ min_time <varint> │ max_time <varint> │ │
│ └─────────┴───────────────────┴───────────────────┘ │
│ . . . │
└─────────────────────────────────────────────────────┘
```

244
tsdb/encoding/encoding.go Normal file
View file

@ -0,0 +1,244 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package encoding
import (
"encoding/binary"
"hash"
"hash/crc32"
"unsafe"
"github.com/pkg/errors"
)
var (
ErrInvalidSize = errors.New("invalid size")
ErrInvalidChecksum = errors.New("invalid checksum")
)
// Encbuf is a helper type to populate a byte slice with various types.
type Encbuf struct {
B []byte
C [binary.MaxVarintLen64]byte
}
func (e *Encbuf) Reset() { e.B = e.B[:0] }
func (e *Encbuf) Get() []byte { return e.B }
func (e *Encbuf) Len() int { return len(e.B) }
func (e *Encbuf) PutString(s string) { e.B = append(e.B, s...) }
func (e *Encbuf) PutByte(c byte) { e.B = append(e.B, c) }
func (e *Encbuf) PutBE32int(x int) { e.PutBE32(uint32(x)) }
func (e *Encbuf) PutUvarint32(x uint32) { e.PutUvarint64(uint64(x)) }
func (e *Encbuf) PutBE64int64(x int64) { e.PutBE64(uint64(x)) }
func (e *Encbuf) PutUvarint(x int) { e.PutUvarint64(uint64(x)) }
func (e *Encbuf) PutBE32(x uint32) {
binary.BigEndian.PutUint32(e.C[:], x)
e.B = append(e.B, e.C[:4]...)
}
func (e *Encbuf) PutBE64(x uint64) {
binary.BigEndian.PutUint64(e.C[:], x)
e.B = append(e.B, e.C[:8]...)
}
func (e *Encbuf) PutUvarint64(x uint64) {
n := binary.PutUvarint(e.C[:], x)
e.B = append(e.B, e.C[:n]...)
}
func (e *Encbuf) PutVarint64(x int64) {
n := binary.PutVarint(e.C[:], x)
e.B = append(e.B, e.C[:n]...)
}
// PutUvarintStr writes a string to the buffer prefixed by its varint length (in bytes!).
func (e *Encbuf) PutUvarintStr(s string) {
b := *(*[]byte)(unsafe.Pointer(&s))
e.PutUvarint(len(b))
e.PutString(s)
}
// PutHash appends a hash over the buffers current contents to the buffer.
func (e *Encbuf) PutHash(h hash.Hash) {
h.Reset()
_, err := h.Write(e.B)
if err != nil {
panic(err) // The CRC32 implementation does not error
}
e.B = h.Sum(e.B)
}
// Decbuf provides safe methods to extract data from a byte slice. It does all
// necessary bounds checking and advancing of the byte slice.
// Several datums can be extracted without checking for errors. However, before using
// any datum, the err() method must be checked.
type Decbuf struct {
B []byte
E error
}
// NewDecbufAt returns a new decoding buffer. It expects the first 4 bytes
// after offset to hold the big endian encoded content length, followed by the contents and the expected
// checksum.
func NewDecbufAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
if bs.Len() < off+4 {
return Decbuf{E: ErrInvalidSize}
}
b := bs.Range(off, off+4)
l := int(binary.BigEndian.Uint32(b))
if bs.Len() < off+4+l+4 {
return Decbuf{E: ErrInvalidSize}
}
// Load bytes holding the contents plus a CRC32 checksum.
b = bs.Range(off+4, off+4+l+4)
dec := Decbuf{B: b[:len(b)-4]}
if exp := binary.BigEndian.Uint32(b[len(b)-4:]); dec.Crc32(castagnoliTable) != exp {
return Decbuf{E: ErrInvalidChecksum}
}
return dec
}
// NewDecbufUvarintAt returns a new decoding buffer. It expects the first bytes
// after offset to hold the uvarint-encoded buffers length, followed by the contents and the expected
// checksum.
func NewDecbufUvarintAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
// We never have to access this method at the far end of the byte slice. Thus just checking
// against the MaxVarintLen32 is sufficient.
if bs.Len() < off+binary.MaxVarintLen32 {
return Decbuf{E: ErrInvalidSize}
}
b := bs.Range(off, off+binary.MaxVarintLen32)
l, n := binary.Uvarint(b)
if n <= 0 || n > binary.MaxVarintLen32 {
return Decbuf{E: errors.Errorf("invalid uvarint %d", n)}
}
if bs.Len() < off+n+int(l)+4 {
return Decbuf{E: ErrInvalidSize}
}
// Load bytes holding the contents plus a CRC32 checksum.
b = bs.Range(off+n, off+n+int(l)+4)
dec := Decbuf{B: b[:len(b)-4]}
if dec.Crc32(castagnoliTable) != binary.BigEndian.Uint32(b[len(b)-4:]) {
return Decbuf{E: ErrInvalidChecksum}
}
return dec
}
func (d *Decbuf) Uvarint() int { return int(d.Uvarint64()) }
func (d *Decbuf) Be32int() int { return int(d.Be32()) }
func (d *Decbuf) Be64int64() int64 { return int64(d.Be64()) }
// Crc32 returns a CRC32 checksum over the remaining bytes.
func (d *Decbuf) Crc32(castagnoliTable *crc32.Table) uint32 {
return crc32.Checksum(d.B, castagnoliTable)
}
func (d *Decbuf) UvarintStr() string {
l := d.Uvarint64()
if d.E != nil {
return ""
}
if len(d.B) < int(l) {
d.E = ErrInvalidSize
return ""
}
s := string(d.B[:l])
d.B = d.B[l:]
return s
}
func (d *Decbuf) Varint64() int64 {
if d.E != nil {
return 0
}
x, n := binary.Varint(d.B)
if n < 1 {
d.E = ErrInvalidSize
return 0
}
d.B = d.B[n:]
return x
}
func (d *Decbuf) Uvarint64() uint64 {
if d.E != nil {
return 0
}
x, n := binary.Uvarint(d.B)
if n < 1 {
d.E = ErrInvalidSize
return 0
}
d.B = d.B[n:]
return x
}
func (d *Decbuf) Be64() uint64 {
if d.E != nil {
return 0
}
if len(d.B) < 8 {
d.E = ErrInvalidSize
return 0
}
x := binary.BigEndian.Uint64(d.B)
d.B = d.B[8:]
return x
}
func (d *Decbuf) Be32() uint32 {
if d.E != nil {
return 0
}
if len(d.B) < 4 {
d.E = ErrInvalidSize
return 0
}
x := binary.BigEndian.Uint32(d.B)
d.B = d.B[4:]
return x
}
func (d *Decbuf) Byte() byte {
if d.E != nil {
return 0
}
if len(d.B) < 1 {
d.E = ErrInvalidSize
return 0
}
x := d.B[0]
d.B = d.B[1:]
return x
}
func (d *Decbuf) Err() error { return d.E }
func (d *Decbuf) Len() int { return len(d.B) }
func (d *Decbuf) Get() []byte { return d.B }
// ByteSlice abstracts a byte slice.
type ByteSlice interface {
Len() int
Range(start, end int) []byte
}

62
tsdb/errors/errors.go Normal file
View file

@ -0,0 +1,62 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package errors
import (
"bytes"
"fmt"
)
// The MultiError type implements the error interface, and contains the
// Errors used to construct it.
type MultiError []error
// Returns a concatenated string of the contained errors
func (es MultiError) Error() string {
var buf bytes.Buffer
if len(es) > 1 {
fmt.Fprintf(&buf, "%d errors: ", len(es))
}
for i, err := range es {
if i != 0 {
buf.WriteString("; ")
}
buf.WriteString(err.Error())
}
return buf.String()
}
// Add adds the error to the error list if it is not nil.
func (es *MultiError) Add(err error) {
if err == nil {
return
}
if merr, ok := err.(MultiError); ok {
*es = append(*es, merr...)
} else {
*es = append(*es, err)
}
}
// Err returns the error list as an error or nil if it is empty.
func (es MultiError) Err() error {
if len(es) == 0 {
return nil
}
return es
}

22
tsdb/fileutil/dir_unix.go Normal file
View file

@ -0,0 +1,22 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows
package fileutil
import "os"
// OpenDir opens a directory for syncing.
func OpenDir(path string) (*os.File, error) { return os.Open(path) }

View file

@ -0,0 +1,46 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package fileutil
import (
"os"
"syscall"
)
// OpenDir opens a directory in windows with write access for syncing.
func OpenDir(path string) (*os.File, error) {
fd, err := openDir(path)
if err != nil {
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
func openDir(path string) (fd syscall.Handle, err error) {
if len(path) == 0 {
return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND
}
pathp, err := syscall.UTF16PtrFromString(path)
if err != nil {
return syscall.InvalidHandle, err
}
access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE)
sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE)
createmode := uint32(syscall.OPEN_EXISTING)
fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS)
return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0)
}

159
tsdb/fileutil/fileutil.go Normal file
View file

@ -0,0 +1,159 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package fileutil provides utility methods used when dealing with the filesystem in tsdb.
// It is largely copied from github.com/coreos/etcd/pkg/fileutil to avoid the
// dependency chain it brings with it.
// Please check github.com/coreos/etcd for licensing information.
package fileutil
import (
"io/ioutil"
"os"
"path/filepath"
"sort"
"strings"
)
// CopyDirs copies all directories, subdirectories and files recursively including the empty folders.
// Source and destination must be full paths.
func CopyDirs(src, dest string) error {
if err := os.MkdirAll(dest, 0777); err != nil {
return err
}
files, err := readDirs(src)
if err != nil {
return err
}
for _, f := range files {
dp := filepath.Join(dest, f)
sp := filepath.Join(src, f)
stat, err := os.Stat(sp)
if err != nil {
return err
}
// Empty directories are also created.
if stat.IsDir() {
if err := os.MkdirAll(dp, 0777); err != nil {
return err
}
continue
}
if err := copyFile(sp, dp); err != nil {
return err
}
}
return nil
}
func copyFile(src, dest string) error {
data, err := ioutil.ReadFile(src)
if err != nil {
return err
}
err = ioutil.WriteFile(dest, data, 0644)
if err != nil {
return err
}
return nil
}
// readDirs reads the source directory recursively and
// returns relative paths to all files and empty directories.
func readDirs(src string) ([]string, error) {
var files []string
err := filepath.Walk(src, func(path string, f os.FileInfo, err error) error {
relativePath := strings.TrimPrefix(path, src)
if len(relativePath) > 0 {
files = append(files, relativePath)
}
return nil
})
if err != nil {
return nil, err
}
return files, nil
}
// ReadDir returns the filenames in the given directory in sorted order.
func ReadDir(dirpath string) ([]string, error) {
dir, err := os.Open(dirpath)
if err != nil {
return nil, err
}
defer dir.Close()
names, err := dir.Readdirnames(-1)
if err != nil {
return nil, err
}
sort.Strings(names)
return names, nil
}
// Rename safely renames a file.
func Rename(from, to string) error {
if err := os.Rename(from, to); err != nil {
return err
}
// Directory was renamed; sync parent dir to persist rename.
pdir, err := OpenDir(filepath.Dir(to))
if err != nil {
return err
}
if err = pdir.Sync(); err != nil {
pdir.Close()
return err
}
return pdir.Close()
}
// Replace moves a file or directory to a new location and deletes any previous data.
// It is not atomic.
func Replace(from, to string) error {
// Remove destination only if it is a dir otherwise leave it to os.Rename
// as it replaces the destination file and is atomic.
{
f, err := os.Stat(to)
if !os.IsNotExist(err) {
if err == nil && f.IsDir() {
if err := os.RemoveAll(to); err != nil {
return err
}
}
}
}
if err := os.Rename(from, to); err != nil {
return err
}
// Directory was renamed; sync parent dir to persist rename.
pdir, err := OpenDir(filepath.Dir(to))
if err != nil {
return err
}
if err = pdir.Sync(); err != nil {
pdir.Close()
return err
}
return pdir.Close()
}

41
tsdb/fileutil/flock.go Normal file
View file

@ -0,0 +1,41 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"path/filepath"
)
// Releaser provides the Release method to release a file lock.
type Releaser interface {
Release() error
}
// Flock locks the file with the provided name. If the file does not exist, it is
// created. The returned Releaser is used to release the lock. existed is true
// if the file to lock already existed. A non-nil error is returned if the
// locking has failed. Neither this function nor the returned Releaser is
// goroutine-safe.
func Flock(fileName string) (r Releaser, existed bool, err error) {
if err = os.MkdirAll(filepath.Dir(fileName), 0755); err != nil {
return nil, false, err
}
_, err = os.Stat(fileName)
existed = err == nil
r, err = newLock(fileName)
return r, existed, err
}

View file

@ -0,0 +1,32 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import "os"
type plan9Lock struct {
f *os.File
}
func (l *plan9Lock) Release() error {
return l.f.Close()
}
func newLock(fileName string) (Releaser, error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644)
if err != nil {
return nil, err
}
return &plan9Lock{f}, nil
}

View file

@ -0,0 +1,59 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build solaris
package fileutil
import (
"os"
"syscall"
)
type unixLock struct {
f *os.File
}
func (l *unixLock) Release() error {
if err := l.set(false); err != nil {
return err
}
return l.f.Close()
}
func (l *unixLock) set(lock bool) error {
flock := syscall.Flock_t{
Type: syscall.F_UNLCK,
Start: 0,
Len: 0,
Whence: 1,
}
if lock {
flock.Type = syscall.F_WRLCK
}
return syscall.FcntlFlock(l.f.Fd(), syscall.F_SETLK, &flock)
}
func newLock(fileName string) (Releaser, error) {
f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644)
if err != nil {
return nil, err
}
l := &unixLock{f}
err = l.set(true)
if err != nil {
f.Close()
return nil, err
}
return l, nil
}

View file

@ -0,0 +1,80 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"path/filepath"
"testing"
"github.com/prometheus/tsdb/testutil"
)
func TestLocking(t *testing.T) {
dir := testutil.NewTemporaryDirectory("test_flock", t)
defer dir.Close()
fileName := filepath.Join(dir.Path(), "LOCK")
if _, err := os.Stat(fileName); err == nil {
t.Fatalf("File %q unexpectedly exists.", fileName)
}
lock, existed, err := Flock(fileName)
if err != nil {
t.Fatalf("Error locking file %q: %s", fileName, err)
}
if existed {
t.Errorf("File %q reported as existing during locking.", fileName)
}
// File must now exist.
if _, err = os.Stat(fileName); err != nil {
t.Errorf("Could not stat file %q expected to exist: %s", fileName, err)
}
// Try to lock again.
lockedAgain, existed, err := Flock(fileName)
if err == nil {
t.Fatalf("File %q locked twice.", fileName)
}
if lockedAgain != nil {
t.Error("Unsuccessful locking did not return nil.")
}
if !existed {
t.Errorf("Existing file %q not recognized.", fileName)
}
if err := lock.Release(); err != nil {
t.Errorf("Error releasing lock for file %q: %s", fileName, err)
}
// File must still exist.
if _, err = os.Stat(fileName); err != nil {
t.Errorf("Could not stat file %q expected to exist: %s", fileName, err)
}
// Lock existing file.
lock, existed, err = Flock(fileName)
if err != nil {
t.Fatalf("Error locking file %q: %s", fileName, err)
}
if !existed {
t.Errorf("Existing file %q not recognized.", fileName)
}
if err := lock.Release(); err != nil {
t.Errorf("Error releasing lock for file %q: %s", fileName, err)
}
}

View file

@ -0,0 +1,54 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build darwin dragonfly freebsd linux netbsd openbsd
package fileutil
import (
"os"
"syscall"
)
type unixLock struct {
f *os.File
}
func (l *unixLock) Release() error {
if err := l.set(false); err != nil {
return err
}
return l.f.Close()
}
func (l *unixLock) set(lock bool) error {
how := syscall.LOCK_UN
if lock {
how = syscall.LOCK_EX
}
return syscall.Flock(int(l.f.Fd()), how|syscall.LOCK_NB)
}
func newLock(fileName string) (Releaser, error) {
f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644)
if err != nil {
return nil, err
}
l := &unixLock{f}
err = l.set(true)
if err != nil {
f.Close()
return nil, err
}
return l, nil
}

View file

@ -0,0 +1,36 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import "syscall"
type windowsLock struct {
fd syscall.Handle
}
func (fl *windowsLock) Release() error {
return syscall.Close(fl.fd)
}
func newLock(fileName string) (Releaser, error) {
pathp, err := syscall.UTF16PtrFromString(fileName)
if err != nil {
return nil, err
}
fd, err := syscall.CreateFile(pathp, syscall.GENERIC_READ|syscall.GENERIC_WRITE, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0)
if err != nil {
return nil, err
}
return &windowsLock{fd}, nil
}

61
tsdb/fileutil/mmap.go Normal file
View file

@ -0,0 +1,61 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"github.com/pkg/errors"
)
type MmapFile struct {
f *os.File
b []byte
}
func OpenMmapFile(path string) (*MmapFile, error) {
f, err := os.Open(path)
if err != nil {
return nil, errors.Wrap(err, "try lock file")
}
info, err := f.Stat()
if err != nil {
return nil, errors.Wrap(err, "stat")
}
b, err := mmap(f, int(info.Size()))
if err != nil {
return nil, errors.Wrap(err, "mmap")
}
return &MmapFile{f: f, b: b}, nil
}
func (f *MmapFile) Close() error {
err0 := munmap(f.b)
err1 := f.f.Close()
if err0 != nil {
return err0
}
return err1
}
func (f *MmapFile) File() *os.File {
return f.f
}
func (f *MmapFile) Bytes() []byte {
return f.b
}

18
tsdb/fileutil/mmap_386.go Normal file
View file

@ -0,0 +1,18 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package fileutil
const maxMapSize = 0x7FFFFFFF // 2GB

View file

@ -0,0 +1,18 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package fileutil
const maxMapSize = 0xFFFFFFFFFFFF // 256TB

View file

@ -0,0 +1,30 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows,!plan9
package fileutil
import (
"os"
"golang.org/x/sys/unix"
)
func mmap(f *os.File, length int) ([]byte, error) {
return unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED)
}
func munmap(b []byte) (err error) {
return unix.Munmap(b)
}

View file

@ -0,0 +1,46 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"syscall"
"unsafe"
)
func mmap(f *os.File, size int) ([]byte, error) {
low, high := uint32(size), uint32(size>>32)
h, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)
if h == 0 {
return nil, os.NewSyscallError("CreateFileMapping", errno)
}
addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size))
if addr == 0 {
return nil, os.NewSyscallError("MapViewOfFile", errno)
}
if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
return nil, os.NewSyscallError("CloseHandle", err)
}
return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil
}
func munmap(b []byte) error {
if err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil {
return os.NewSyscallError("UnmapViewOfFile", err)
}
return nil
}

View file

@ -0,0 +1,54 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"io"
"os"
)
// Preallocate tries to allocate the space for given
// file. This operation is only supported on linux by a
// few filesystems (btrfs, ext4, etc.).
// If the operation is unsupported, no error will be returned.
// Otherwise, the error encountered will be returned.
func Preallocate(f *os.File, sizeInBytes int64, extendFile bool) error {
if sizeInBytes == 0 {
// fallocate will return EINVAL if length is 0; skip
return nil
}
if extendFile {
return preallocExtend(f, sizeInBytes)
}
return preallocFixed(f, sizeInBytes)
}
func preallocExtendTrunc(f *os.File, sizeInBytes int64) error {
curOff, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
size, err := f.Seek(sizeInBytes, io.SeekEnd)
if err != nil {
return err
}
if _, err = f.Seek(curOff, io.SeekStart); err != nil {
return err
}
if sizeInBytes > size {
return nil
}
return f.Truncate(sizeInBytes)
}

View file

@ -0,0 +1,41 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"syscall"
"unsafe"
)
func preallocExtend(f *os.File, sizeInBytes int64) error {
if err := preallocFixed(f, sizeInBytes); err != nil {
return err
}
return preallocExtendTrunc(f, sizeInBytes)
}
func preallocFixed(f *os.File, sizeInBytes int64) error {
fstore := &syscall.Fstore_t{
Flags: syscall.F_ALLOCATEALL,
Posmode: syscall.F_PEOFPOSMODE,
Length: sizeInBytes}
p := unsafe.Pointer(fstore)
_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_PREALLOCATE), uintptr(p))
if errno == 0 || errno == syscall.ENOTSUP {
return nil
}
return errno
}

View file

@ -0,0 +1,47 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fileutil
import (
"os"
"syscall"
)
func preallocExtend(f *os.File, sizeInBytes int64) error {
// use mode = 0 to change size
err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
// not supported; fallback
// fallocate EINTRs frequently in some environments; fallback
if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
return preallocExtendTrunc(f, sizeInBytes)
}
}
return err
}
func preallocFixed(f *os.File, sizeInBytes int64) error {
// use mode = 1 to keep size; see FALLOC_FL_KEEP_SIZE
err := syscall.Fallocate(int(f.Fd()), 1, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
// treat not supported as nil error
if ok && errno == syscall.ENOTSUP {
return nil
}
}
return err
}

View file

@ -0,0 +1,25 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!darwin
package fileutil
import "os"
func preallocExtend(f *os.File, sizeInBytes int64) error {
return preallocExtendTrunc(f, sizeInBytes)
}
func preallocFixed(f *os.File, sizeInBytes int64) error { return nil }

24
tsdb/fileutil/sync.go Normal file
View file

@ -0,0 +1,24 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!darwin
package fileutil
import "os"
// Fdatasync is a wrapper around file.Sync(). Special handling is needed on linux platform.
func Fdatasync(f *os.File) error {
return f.Sync()
}

View file

@ -0,0 +1,27 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build darwin
package fileutil
import (
"os"
)
// Fdatasync on darwin platform invokes fcntl(F_FULLFSYNC) for actual persistence
// on physical drive media.
func Fdatasync(f *os.File) error {
return f.Sync()
}

View file

@ -0,0 +1,29 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
package fileutil
import (
"os"
"syscall"
)
// Fdatasync is similar to fsync(), but does not flush modified metadata
// unless that metadata is needed in order to allow a subsequent data retrieval
// to be correctly handled.
func Fdatasync(f *os.File) error {
return syscall.Fdatasync(int(f.Fd()))
}

14
tsdb/go.mod Normal file
View file

@ -0,0 +1,14 @@
module github.com/prometheus/tsdb
require (
github.com/cespare/xxhash v1.1.0
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954
github.com/go-kit/kit v0.8.0
github.com/golang/snappy v0.0.1
github.com/oklog/ulid v1.3.1
github.com/pkg/errors v0.8.0
github.com/prometheus/client_golang v1.0.0
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5
gopkg.in/alecthomas/kingpin.v2 v2.2.6
)

83
tsdb/go.sum Normal file
View file

@ -0,0 +1,83 @@
github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 h1:xJ4a3vCFaGF/jqvzLMYoU8P317H5OQ+Via4RmuPwCS0=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954 h1:RMLoZVzv4GliuWafOuPuQDKSm1SJph7uCRnnS61JAn4=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/go-kit/kit v0.8.0 h1:Wz+5lgoB0kkuqLEc6NVmwRknTKP6dTGbSqvhZtBI/j0=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0 h1:8HUsc87TaSWLKwrnumgC8/YconD2fJQsRJAsWaPg2ic=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515 h1:T+h1c/A9Gawja4Y9mFVWj2vyii2bbUNDw3kt9VxK2EY=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.1 h1:K47Rk0v/fkEfwfQet2KWhscE0cJzjgCCDBG2KHZoVno=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0 h1:vrDKnkGzuGvhNAL56c7DBz29ZL+KxnoR0x7enabFceM=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 h1:idejC8f05m9MGOsuEi1ATq9shN03HrxNkD/luQvxCv8=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 h1:S/YWwWx/RA8rT8tKFRuGUZhuA90OyIBpPCXkcbwU8DE=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/common v0.4.1 h1:K0MGApIoQvMw27RTdJkPbr3JZ7DNbtxQNyi5STVM6Kw=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d h1:GoAlyOgbOEIFdaDqxJVlbOQ1DtGmZWs/Qau0hIlk+WQ=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTmV7VDcZyvRZ+QQXkXTZQ=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5 h1:mzjBh+S5frKOsOBobWIMAbXavqjmgO17k/2puhcFR94=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

View file

@ -0,0 +1,27 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package goversion_test
import (
"testing"
_ "github.com/prometheus/tsdb/goversion"
)
// This test is is intentionally blank and exists only so `go test` believes
// there is something to test.
//
// The blank import above is actually what invokes the test of this package. If
// the import succeeds (the code compiles), the test passed.
func Test(t *testing.T) {}

View file

@ -0,0 +1,19 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build go1.12
// Package goversion enforces the go version suported by the tsdb module.
package goversion
const _SoftwareRequiresGOVERSION1_12 = uint8(0)

17
tsdb/goversion/init.go Normal file
View file

@ -0,0 +1,17 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package goversion
// This will fail to compile if the Go runtime version isn't >= 1.12.
var _ = _SoftwareRequiresGOVERSION1_12

1873
tsdb/head.go Normal file

File diff suppressed because it is too large Load diff

120
tsdb/head_bench_test.go Normal file
View file

@ -0,0 +1,120 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"strconv"
"sync/atomic"
"testing"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
)
func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
// Put a series, select it. GC it and then access it.
h, err := NewHead(nil, nil, nil, 1000)
testutil.Ok(b, err)
defer h.Close()
for i := 0; i < b.N; i++ {
h.getOrCreate(uint64(i), labels.FromStrings("a", strconv.Itoa(i)))
}
}
func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
// Put a series, select it. GC it and then access it.
h, err := NewHead(nil, nil, nil, 1000)
testutil.Ok(b, err)
defer h.Close()
var count int64
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
i := atomic.AddInt64(&count, 1)
h.getOrCreate(uint64(i), labels.FromStrings("a", strconv.Itoa(int(i))))
}
})
}
func BenchmarkHeadPostingForMatchers(b *testing.B) {
h, err := NewHead(nil, nil, nil, 1000)
testutil.Ok(b, err)
defer func() {
testutil.Ok(b, h.Close())
}()
var ref uint64
addSeries := func(l labels.Labels) {
ref++
h.getOrCreateWithID(ref, l.Hash(), l)
}
for n := 0; n < 10; n++ {
for i := 0; i < 100000; i++ {
addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", strconv.Itoa(n), "j", "foo"))
// Have some series that won't be matched, to properly test inverted matches.
addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", strconv.Itoa(n), "j", "bar"))
addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "0_"+strconv.Itoa(n), "j", "bar"))
addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "1_"+strconv.Itoa(n), "j", "bar"))
addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "2_"+strconv.Itoa(n), "j", "foo"))
}
}
n1 := labels.NewEqualMatcher("n", "1")
jFoo := labels.NewEqualMatcher("j", "foo")
jNotFoo := labels.Not(jFoo)
iStar := labels.NewMustRegexpMatcher("i", "^.*$")
iPlus := labels.NewMustRegexpMatcher("i", "^.+$")
i1Plus := labels.NewMustRegexpMatcher("i", "^1.+$")
iEmptyRe := labels.NewMustRegexpMatcher("i", "^$")
iNotEmpty := labels.Not(labels.NewEqualMatcher("i", ""))
iNot2 := labels.Not(labels.NewEqualMatcher("n", "2"))
iNot2Star := labels.Not(labels.NewMustRegexpMatcher("i", "^2.*$"))
cases := []struct {
name string
matchers []labels.Matcher
}{
{`n="1"`, []labels.Matcher{n1}},
{`n="1",j="foo"`, []labels.Matcher{n1, jFoo}},
{`j="foo",n="1"`, []labels.Matcher{jFoo, n1}},
{`n="1",j!="foo"`, []labels.Matcher{n1, jNotFoo}},
{`i=~".*"`, []labels.Matcher{iStar}},
{`i=~".+"`, []labels.Matcher{iPlus}},
{`i=~""`, []labels.Matcher{iEmptyRe}},
{`i!=""`, []labels.Matcher{iNotEmpty}},
{`n="1",i=~".*",j="foo"`, []labels.Matcher{n1, iStar, jFoo}},
{`n="1",i=~".*",i!="2",j="foo"`, []labels.Matcher{n1, iStar, iNot2, jFoo}},
{`n="1",i!=""`, []labels.Matcher{n1, iNotEmpty}},
{`n="1",i!="",j="foo"`, []labels.Matcher{n1, iNotEmpty, jFoo}},
{`n="1",i=~".+",j="foo"`, []labels.Matcher{n1, iPlus, jFoo}},
{`n="1",i=~"1.+",j="foo"`, []labels.Matcher{n1, i1Plus, jFoo}},
{`n="1",i=~".+",i!="2",j="foo"`, []labels.Matcher{n1, iPlus, iNot2, jFoo}},
{`n="1",i=~".+",i!~"2.*",j="foo"`, []labels.Matcher{n1, iPlus, iNot2Star, jFoo}},
}
for _, c := range cases {
b.Run(c.name, func(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := PostingsForMatchers(h.indexRange(0, 1000), c.matchers...)
testutil.Ok(b, err)
}
})
}
}

1209
tsdb/head_test.go Normal file

File diff suppressed because it is too large Load diff

1134
tsdb/index/index.go Normal file

File diff suppressed because it is too large Load diff

429
tsdb/index/index_test.go Normal file
View file

@ -0,0 +1,429 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package index
import (
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"sort"
"testing"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/chunkenc"
"github.com/prometheus/tsdb/chunks"
"github.com/prometheus/tsdb/encoding"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
)
type series struct {
l labels.Labels
chunks []chunks.Meta
}
type mockIndex struct {
series map[uint64]series
labelIndex map[string][]string
postings map[labels.Label][]uint64
symbols map[string]struct{}
}
func newMockIndex() mockIndex {
ix := mockIndex{
series: make(map[uint64]series),
labelIndex: make(map[string][]string),
postings: make(map[labels.Label][]uint64),
symbols: make(map[string]struct{}),
}
return ix
}
func (m mockIndex) Symbols() (map[string]struct{}, error) {
return m.symbols, nil
}
func (m mockIndex) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error {
if _, ok := m.series[ref]; ok {
return errors.Errorf("series with reference %d already added", ref)
}
for _, lbl := range l {
m.symbols[lbl.Name] = struct{}{}
m.symbols[lbl.Value] = struct{}{}
}
s := series{l: l}
// Actual chunk data is not stored in the index.
for _, c := range chunks {
c.Chunk = nil
s.chunks = append(s.chunks, c)
}
m.series[ref] = s
return nil
}
func (m mockIndex) WriteLabelIndex(names []string, values []string) error {
// TODO support composite indexes
if len(names) != 1 {
return errors.New("composite indexes not supported yet")
}
sort.Strings(values)
m.labelIndex[names[0]] = values
return nil
}
func (m mockIndex) WritePostings(name, value string, it Postings) error {
l := labels.Label{Name: name, Value: value}
if _, ok := m.postings[l]; ok {
return errors.Errorf("postings for %s already added", l)
}
ep, err := ExpandPostings(it)
if err != nil {
return err
}
m.postings[l] = ep
return nil
}
func (m mockIndex) Close() error {
return nil
}
func (m mockIndex) LabelValues(names ...string) (StringTuples, error) {
// TODO support composite indexes
if len(names) != 1 {
return nil, errors.New("composite indexes not supported yet")
}
return NewStringTuples(m.labelIndex[names[0]], 1)
}
func (m mockIndex) Postings(name, value string) (Postings, error) {
l := labels.Label{Name: name, Value: value}
return NewListPostings(m.postings[l]), nil
}
func (m mockIndex) SortedPostings(p Postings) Postings {
ep, err := ExpandPostings(p)
if err != nil {
return ErrPostings(errors.Wrap(err, "expand postings"))
}
sort.Slice(ep, func(i, j int) bool {
return labels.Compare(m.series[ep[i]].l, m.series[ep[j]].l) < 0
})
return NewListPostings(ep)
}
func (m mockIndex) Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error {
s, ok := m.series[ref]
if !ok {
return errors.New("not found")
}
*lset = append((*lset)[:0], s.l...)
*chks = append((*chks)[:0], s.chunks...)
return nil
}
func (m mockIndex) LabelIndices() ([][]string, error) {
res := make([][]string, 0, len(m.labelIndex))
for k := range m.labelIndex {
res = append(res, []string{k})
}
return res, nil
}
func TestIndexRW_Create_Open(t *testing.T) {
dir, err := ioutil.TempDir("", "test_index_create")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
fn := filepath.Join(dir, indexFilename)
// An empty index must still result in a readable file.
iw, err := NewWriter(fn)
testutil.Ok(t, err)
testutil.Ok(t, iw.Close())
ir, err := NewFileReader(fn)
testutil.Ok(t, err)
testutil.Ok(t, ir.Close())
// Modify magic header must cause open to fail.
f, err := os.OpenFile(fn, os.O_WRONLY, 0666)
testutil.Ok(t, err)
_, err = f.WriteAt([]byte{0, 0}, 0)
testutil.Ok(t, err)
f.Close()
_, err = NewFileReader(dir)
testutil.NotOk(t, err)
}
func TestIndexRW_Postings(t *testing.T) {
dir, err := ioutil.TempDir("", "test_index_postings")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
fn := filepath.Join(dir, indexFilename)
iw, err := NewWriter(fn)
testutil.Ok(t, err)
series := []labels.Labels{
labels.FromStrings("a", "1", "b", "1"),
labels.FromStrings("a", "1", "b", "2"),
labels.FromStrings("a", "1", "b", "3"),
labels.FromStrings("a", "1", "b", "4"),
}
err = iw.AddSymbols(map[string]struct{}{
"a": {},
"b": {},
"1": {},
"2": {},
"3": {},
"4": {},
})
testutil.Ok(t, err)
// Postings lists are only written if a series with the respective
// reference was added before.
testutil.Ok(t, iw.AddSeries(1, series[0]))
testutil.Ok(t, iw.AddSeries(2, series[1]))
testutil.Ok(t, iw.AddSeries(3, series[2]))
testutil.Ok(t, iw.AddSeries(4, series[3]))
err = iw.WritePostings("a", "1", newListPostings(1, 2, 3, 4))
testutil.Ok(t, err)
testutil.Ok(t, iw.Close())
ir, err := NewFileReader(fn)
testutil.Ok(t, err)
p, err := ir.Postings("a", "1")
testutil.Ok(t, err)
var l labels.Labels
var c []chunks.Meta
for i := 0; p.Next(); i++ {
err := ir.Series(p.At(), &l, &c)
testutil.Ok(t, err)
testutil.Equals(t, 0, len(c))
testutil.Equals(t, series[i], l)
}
testutil.Ok(t, p.Err())
testutil.Ok(t, ir.Close())
}
func TestPersistence_index_e2e(t *testing.T) {
dir, err := ioutil.TempDir("", "test_persistence_e2e")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
lbls, err := labels.ReadLabels(filepath.Join("..", "testdata", "20kseries.json"), 20000)
testutil.Ok(t, err)
// Sort labels as the index writer expects series in sorted order.
sort.Sort(labels.Slice(lbls))
symbols := map[string]struct{}{}
for _, lset := range lbls {
for _, l := range lset {
symbols[l.Name] = struct{}{}
symbols[l.Value] = struct{}{}
}
}
var input indexWriterSeriesSlice
// Generate ChunkMetas for every label set.
for i, lset := range lbls {
var metas []chunks.Meta
for j := 0; j <= (i % 20); j++ {
metas = append(metas, chunks.Meta{
MinTime: int64(j * 10000),
MaxTime: int64((j + 1) * 10000),
Ref: rand.Uint64(),
Chunk: chunkenc.NewXORChunk(),
})
}
input = append(input, &indexWriterSeries{
labels: lset,
chunks: metas,
})
}
iw, err := NewWriter(filepath.Join(dir, indexFilename))
testutil.Ok(t, err)
testutil.Ok(t, iw.AddSymbols(symbols))
// Population procedure as done by compaction.
var (
postings = NewMemPostings()
values = map[string]map[string]struct{}{}
)
mi := newMockIndex()
for i, s := range input {
err = iw.AddSeries(uint64(i), s.labels, s.chunks...)
testutil.Ok(t, err)
testutil.Ok(t, mi.AddSeries(uint64(i), s.labels, s.chunks...))
for _, l := range s.labels {
valset, ok := values[l.Name]
if !ok {
valset = map[string]struct{}{}
values[l.Name] = valset
}
valset[l.Value] = struct{}{}
}
postings.Add(uint64(i), s.labels)
}
for k, v := range values {
var vals []string
for e := range v {
vals = append(vals, e)
}
sort.Strings(vals)
testutil.Ok(t, iw.WriteLabelIndex([]string{k}, vals))
testutil.Ok(t, mi.WriteLabelIndex([]string{k}, vals))
}
all := make([]uint64, len(lbls))
for i := range all {
all[i] = uint64(i)
}
err = iw.WritePostings("", "", newListPostings(all...))
testutil.Ok(t, err)
testutil.Ok(t, mi.WritePostings("", "", newListPostings(all...)))
for n, e := range postings.m {
for v := range e {
err = iw.WritePostings(n, v, postings.Get(n, v))
testutil.Ok(t, err)
mi.WritePostings(n, v, postings.Get(n, v))
}
}
err = iw.Close()
testutil.Ok(t, err)
ir, err := NewFileReader(filepath.Join(dir, indexFilename))
testutil.Ok(t, err)
for p := range mi.postings {
gotp, err := ir.Postings(p.Name, p.Value)
testutil.Ok(t, err)
expp, err := mi.Postings(p.Name, p.Value)
testutil.Ok(t, err)
var lset, explset labels.Labels
var chks, expchks []chunks.Meta
for gotp.Next() {
testutil.Assert(t, expp.Next() == true, "")
ref := gotp.At()
err := ir.Series(ref, &lset, &chks)
testutil.Ok(t, err)
err = mi.Series(expp.At(), &explset, &expchks)
testutil.Ok(t, err)
testutil.Equals(t, explset, lset)
testutil.Equals(t, expchks, chks)
}
testutil.Assert(t, expp.Next() == false, "")
testutil.Ok(t, gotp.Err())
}
for k, v := range mi.labelIndex {
tplsExp, err := NewStringTuples(v, 1)
testutil.Ok(t, err)
tplsRes, err := ir.LabelValues(k)
testutil.Ok(t, err)
testutil.Equals(t, tplsExp.Len(), tplsRes.Len())
for i := 0; i < tplsExp.Len(); i++ {
strsExp, err := tplsExp.At(i)
testutil.Ok(t, err)
strsRes, err := tplsRes.At(i)
testutil.Ok(t, err)
testutil.Equals(t, strsExp, strsRes)
}
}
gotSymbols, err := ir.Symbols()
testutil.Ok(t, err)
testutil.Equals(t, len(mi.symbols), len(gotSymbols))
for s := range mi.symbols {
_, ok := gotSymbols[s]
testutil.Assert(t, ok, "")
}
testutil.Ok(t, ir.Close())
}
func TestDecbufUvariantWithInvalidBuffer(t *testing.T) {
b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
db := encoding.NewDecbufUvarintAt(b, 0, castagnoliTable)
testutil.NotOk(t, db.Err())
}
func TestReaderWithInvalidBuffer(t *testing.T) {
b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
_, err := NewReader(b)
testutil.NotOk(t, err)
}
// TestNewFileReaderErrorNoOpenFiles ensures that in case of an error no file remains open.
func TestNewFileReaderErrorNoOpenFiles(t *testing.T) {
dir := testutil.NewTemporaryDirectory("block", t)
idxName := filepath.Join(dir.Path(), "index")
err := ioutil.WriteFile(idxName, []byte("corrupted contents"), 0644)
testutil.Ok(t, err)
_, err = NewFileReader(idxName)
testutil.NotOk(t, err)
// dir.Close will fail on Win if idxName fd is not closed on error path.
dir.Close()
}

691
tsdb/index/postings.go Normal file
View file

@ -0,0 +1,691 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package index
import (
"container/heap"
"encoding/binary"
"runtime"
"sort"
"strings"
"sync"
"github.com/prometheus/tsdb/labels"
)
var allPostingsKey = labels.Label{}
// AllPostingsKey returns the label key that is used to store the postings list of all existing IDs.
func AllPostingsKey() (name, value string) {
return allPostingsKey.Name, allPostingsKey.Value
}
// MemPostings holds postings list for series ID per label pair. They may be written
// to out of order.
// ensureOrder() must be called once before any reads are done. This allows for quick
// unordered batch fills on startup.
type MemPostings struct {
mtx sync.RWMutex
m map[string]map[string][]uint64
ordered bool
}
// NewMemPostings returns a memPostings that's ready for reads and writes.
func NewMemPostings() *MemPostings {
return &MemPostings{
m: make(map[string]map[string][]uint64, 512),
ordered: true,
}
}
// NewUnorderedMemPostings returns a memPostings that is not safe to be read from
// until ensureOrder was called once.
func NewUnorderedMemPostings() *MemPostings {
return &MemPostings{
m: make(map[string]map[string][]uint64, 512),
ordered: false,
}
}
// SortedKeys returns a list of sorted label keys of the postings.
func (p *MemPostings) SortedKeys() []labels.Label {
p.mtx.RLock()
keys := make([]labels.Label, 0, len(p.m))
for n, e := range p.m {
for v := range e {
keys = append(keys, labels.Label{Name: n, Value: v})
}
}
p.mtx.RUnlock()
sort.Slice(keys, func(i, j int) bool {
if d := strings.Compare(keys[i].Name, keys[j].Name); d != 0 {
return d < 0
}
return keys[i].Value < keys[j].Value
})
return keys
}
// Get returns a postings list for the given label pair.
func (p *MemPostings) Get(name, value string) Postings {
var lp []uint64
p.mtx.RLock()
l := p.m[name]
if l != nil {
lp = l[value]
}
p.mtx.RUnlock()
if lp == nil {
return EmptyPostings()
}
return newListPostings(lp...)
}
// All returns a postings list over all documents ever added.
func (p *MemPostings) All() Postings {
return p.Get(AllPostingsKey())
}
// EnsureOrder ensures that all postings lists are sorted. After it returns all further
// calls to add and addFor will insert new IDs in a sorted manner.
func (p *MemPostings) EnsureOrder() {
p.mtx.Lock()
defer p.mtx.Unlock()
if p.ordered {
return
}
n := runtime.GOMAXPROCS(0)
workc := make(chan []uint64)
var wg sync.WaitGroup
wg.Add(n)
for i := 0; i < n; i++ {
go func() {
for l := range workc {
sort.Slice(l, func(i, j int) bool { return l[i] < l[j] })
}
wg.Done()
}()
}
for _, e := range p.m {
for _, l := range e {
workc <- l
}
}
close(workc)
wg.Wait()
p.ordered = true
}
// Delete removes all ids in the given map from the postings lists.
func (p *MemPostings) Delete(deleted map[uint64]struct{}) {
var keys, vals []string
// Collect all keys relevant for deletion once. New keys added afterwards
// can by definition not be affected by any of the given deletes.
p.mtx.RLock()
for n := range p.m {
keys = append(keys, n)
}
p.mtx.RUnlock()
for _, n := range keys {
p.mtx.RLock()
vals = vals[:0]
for v := range p.m[n] {
vals = append(vals, v)
}
p.mtx.RUnlock()
// For each posting we first analyse whether the postings list is affected by the deletes.
// If yes, we actually reallocate a new postings list.
for _, l := range vals {
// Only lock for processing one postings list so we don't block reads for too long.
p.mtx.Lock()
found := false
for _, id := range p.m[n][l] {
if _, ok := deleted[id]; ok {
found = true
break
}
}
if !found {
p.mtx.Unlock()
continue
}
repl := make([]uint64, 0, len(p.m[n][l]))
for _, id := range p.m[n][l] {
if _, ok := deleted[id]; !ok {
repl = append(repl, id)
}
}
if len(repl) > 0 {
p.m[n][l] = repl
} else {
delete(p.m[n], l)
}
p.mtx.Unlock()
}
p.mtx.Lock()
if len(p.m[n]) == 0 {
delete(p.m, n)
}
p.mtx.Unlock()
}
}
// Iter calls f for each postings list. It aborts if f returns an error and returns it.
func (p *MemPostings) Iter(f func(labels.Label, Postings) error) error {
p.mtx.RLock()
defer p.mtx.RUnlock()
for n, e := range p.m {
for v, p := range e {
if err := f(labels.Label{Name: n, Value: v}, newListPostings(p...)); err != nil {
return err
}
}
}
return nil
}
// Add a label set to the postings index.
func (p *MemPostings) Add(id uint64, lset labels.Labels) {
p.mtx.Lock()
for _, l := range lset {
p.addFor(id, l)
}
p.addFor(id, allPostingsKey)
p.mtx.Unlock()
}
func (p *MemPostings) addFor(id uint64, l labels.Label) {
nm, ok := p.m[l.Name]
if !ok {
nm = map[string][]uint64{}
p.m[l.Name] = nm
}
list := append(nm[l.Value], id)
nm[l.Value] = list
if !p.ordered {
return
}
// There is no guarantee that no higher ID was inserted before as they may
// be generated independently before adding them to postings.
// We repair order violations on insert. The invariant is that the first n-1
// items in the list are already sorted.
for i := len(list) - 1; i >= 1; i-- {
if list[i] >= list[i-1] {
break
}
list[i], list[i-1] = list[i-1], list[i]
}
}
// ExpandPostings returns the postings expanded as a slice.
func ExpandPostings(p Postings) (res []uint64, err error) {
for p.Next() {
res = append(res, p.At())
}
return res, p.Err()
}
// Postings provides iterative access over a postings list.
type Postings interface {
// Next advances the iterator and returns true if another value was found.
Next() bool
// Seek advances the iterator to value v or greater and returns
// true if a value was found.
Seek(v uint64) bool
// At returns the value at the current iterator position.
At() uint64
// Err returns the last error of the iterator.
Err() error
}
// errPostings is an empty iterator that always errors.
type errPostings struct {
err error
}
func (e errPostings) Next() bool { return false }
func (e errPostings) Seek(uint64) bool { return false }
func (e errPostings) At() uint64 { return 0 }
func (e errPostings) Err() error { return e.err }
var emptyPostings = errPostings{}
// EmptyPostings returns a postings list that's always empty.
// NOTE: Returning EmptyPostings sentinel when index.Postings struct has no postings is recommended.
// It triggers optimized flow in other functions like Intersect, Without etc.
func EmptyPostings() Postings {
return emptyPostings
}
// ErrPostings returns new postings that immediately error.
func ErrPostings(err error) Postings {
return errPostings{err}
}
// Intersect returns a new postings list over the intersection of the
// input postings.
func Intersect(its ...Postings) Postings {
if len(its) == 0 {
return EmptyPostings()
}
if len(its) == 1 {
return its[0]
}
for _, p := range its {
if p == EmptyPostings() {
return EmptyPostings()
}
}
return newIntersectPostings(its...)
}
type intersectPostings struct {
arr []Postings
cur uint64
}
func newIntersectPostings(its ...Postings) *intersectPostings {
return &intersectPostings{arr: its}
}
func (it *intersectPostings) At() uint64 {
return it.cur
}
func (it *intersectPostings) doNext() bool {
Loop:
for {
for _, p := range it.arr {
if !p.Seek(it.cur) {
return false
}
if p.At() > it.cur {
it.cur = p.At()
continue Loop
}
}
return true
}
}
func (it *intersectPostings) Next() bool {
for _, p := range it.arr {
if !p.Next() {
return false
}
if p.At() > it.cur {
it.cur = p.At()
}
}
return it.doNext()
}
func (it *intersectPostings) Seek(id uint64) bool {
it.cur = id
return it.doNext()
}
func (it *intersectPostings) Err() error {
for _, p := range it.arr {
if p.Err() != nil {
return p.Err()
}
}
return nil
}
// Merge returns a new iterator over the union of the input iterators.
func Merge(its ...Postings) Postings {
if len(its) == 0 {
return EmptyPostings()
}
if len(its) == 1 {
return its[0]
}
p, ok := newMergedPostings(its)
if !ok {
return EmptyPostings()
}
return p
}
type postingsHeap []Postings
func (h postingsHeap) Len() int { return len(h) }
func (h postingsHeap) Less(i, j int) bool { return h[i].At() < h[j].At() }
func (h *postingsHeap) Swap(i, j int) { (*h)[i], (*h)[j] = (*h)[j], (*h)[i] }
func (h *postingsHeap) Push(x interface{}) {
*h = append(*h, x.(Postings))
}
func (h *postingsHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
type mergedPostings struct {
h postingsHeap
initilized bool
cur uint64
err error
}
func newMergedPostings(p []Postings) (m *mergedPostings, nonEmpty bool) {
ph := make(postingsHeap, 0, len(p))
for _, it := range p {
// NOTE: mergedPostings struct requires the user to issue an initial Next.
if it.Next() {
ph = append(ph, it)
} else {
if it.Err() != nil {
return &mergedPostings{err: it.Err()}, true
}
}
}
if len(ph) == 0 {
return nil, false
}
return &mergedPostings{h: ph}, true
}
func (it *mergedPostings) Next() bool {
if it.h.Len() == 0 || it.err != nil {
return false
}
// The user must issue an initial Next.
if !it.initilized {
heap.Init(&it.h)
it.cur = it.h[0].At()
it.initilized = true
return true
}
for {
cur := it.h[0]
if !cur.Next() {
heap.Pop(&it.h)
if cur.Err() != nil {
it.err = cur.Err()
return false
}
if it.h.Len() == 0 {
return false
}
} else {
// Value of top of heap has changed, re-heapify.
heap.Fix(&it.h, 0)
}
if it.h[0].At() != it.cur {
it.cur = it.h[0].At()
return true
}
}
}
func (it *mergedPostings) Seek(id uint64) bool {
if it.h.Len() == 0 || it.err != nil {
return false
}
if !it.initilized {
if !it.Next() {
return false
}
}
for it.cur < id {
cur := it.h[0]
if !cur.Seek(id) {
heap.Pop(&it.h)
if cur.Err() != nil {
it.err = cur.Err()
return false
}
if it.h.Len() == 0 {
return false
}
} else {
// Value of top of heap has changed, re-heapify.
heap.Fix(&it.h, 0)
}
it.cur = it.h[0].At()
}
return true
}
func (it mergedPostings) At() uint64 {
return it.cur
}
func (it mergedPostings) Err() error {
return it.err
}
// Without returns a new postings list that contains all elements from the full list that
// are not in the drop list.
func Without(full, drop Postings) Postings {
if full == EmptyPostings() {
return EmptyPostings()
}
if drop == EmptyPostings() {
return full
}
return newRemovedPostings(full, drop)
}
type removedPostings struct {
full, remove Postings
cur uint64
initialized bool
fok, rok bool
}
func newRemovedPostings(full, remove Postings) *removedPostings {
return &removedPostings{
full: full,
remove: remove,
}
}
func (rp *removedPostings) At() uint64 {
return rp.cur
}
func (rp *removedPostings) Next() bool {
if !rp.initialized {
rp.fok = rp.full.Next()
rp.rok = rp.remove.Next()
rp.initialized = true
}
for {
if !rp.fok {
return false
}
if !rp.rok {
rp.cur = rp.full.At()
rp.fok = rp.full.Next()
return true
}
fcur, rcur := rp.full.At(), rp.remove.At()
if fcur < rcur {
rp.cur = fcur
rp.fok = rp.full.Next()
return true
} else if rcur < fcur {
// Forward the remove postings to the right position.
rp.rok = rp.remove.Seek(fcur)
} else {
// Skip the current posting.
rp.fok = rp.full.Next()
}
}
}
func (rp *removedPostings) Seek(id uint64) bool {
if rp.cur >= id {
return true
}
rp.fok = rp.full.Seek(id)
rp.rok = rp.remove.Seek(id)
rp.initialized = true
return rp.Next()
}
func (rp *removedPostings) Err() error {
if rp.full.Err() != nil {
return rp.full.Err()
}
return rp.remove.Err()
}
// ListPostings implements the Postings interface over a plain list.
type ListPostings struct {
list []uint64
cur uint64
}
func NewListPostings(list []uint64) Postings {
return newListPostings(list...)
}
func newListPostings(list ...uint64) *ListPostings {
return &ListPostings{list: list}
}
func (it *ListPostings) At() uint64 {
return it.cur
}
func (it *ListPostings) Next() bool {
if len(it.list) > 0 {
it.cur = it.list[0]
it.list = it.list[1:]
return true
}
it.cur = 0
return false
}
func (it *ListPostings) Seek(x uint64) bool {
// If the current value satisfies, then return.
if it.cur >= x {
return true
}
if len(it.list) == 0 {
return false
}
// Do binary search between current position and end.
i := sort.Search(len(it.list), func(i int) bool {
return it.list[i] >= x
})
if i < len(it.list) {
it.cur = it.list[i]
it.list = it.list[i+1:]
return true
}
it.list = nil
return false
}
func (it *ListPostings) Err() error {
return nil
}
// bigEndianPostings implements the Postings interface over a byte stream of
// big endian numbers.
type bigEndianPostings struct {
list []byte
cur uint32
}
func newBigEndianPostings(list []byte) *bigEndianPostings {
return &bigEndianPostings{list: list}
}
func (it *bigEndianPostings) At() uint64 {
return uint64(it.cur)
}
func (it *bigEndianPostings) Next() bool {
if len(it.list) >= 4 {
it.cur = binary.BigEndian.Uint32(it.list)
it.list = it.list[4:]
return true
}
return false
}
func (it *bigEndianPostings) Seek(x uint64) bool {
if uint64(it.cur) >= x {
return true
}
num := len(it.list) / 4
// Do binary search between current position and end.
i := sort.Search(num, func(i int) bool {
return binary.BigEndian.Uint32(it.list[i*4:]) >= uint32(x)
})
if i < num {
j := i * 4
it.cur = binary.BigEndian.Uint32(it.list[j:])
it.list = it.list[j+4:]
return true
}
it.list = nil
return false
}
func (it *bigEndianPostings) Err() error {
return nil
}

814
tsdb/index/postings_test.go Normal file
View file

@ -0,0 +1,814 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package index
import (
"encoding/binary"
"fmt"
"math/rand"
"sort"
"testing"
"github.com/prometheus/tsdb/testutil"
)
func TestMemPostings_addFor(t *testing.T) {
p := NewMemPostings()
p.m[allPostingsKey.Name] = map[string][]uint64{}
p.m[allPostingsKey.Name][allPostingsKey.Value] = []uint64{1, 2, 3, 4, 6, 7, 8}
p.addFor(5, allPostingsKey)
testutil.Equals(t, []uint64{1, 2, 3, 4, 5, 6, 7, 8}, p.m[allPostingsKey.Name][allPostingsKey.Value])
}
func TestMemPostings_ensureOrder(t *testing.T) {
p := NewUnorderedMemPostings()
p.m["a"] = map[string][]uint64{}
for i := 0; i < 100; i++ {
l := make([]uint64, 100)
for j := range l {
l[j] = rand.Uint64()
}
v := fmt.Sprintf("%d", i)
p.m["a"][v] = l
}
p.EnsureOrder()
for _, e := range p.m {
for _, l := range e {
ok := sort.SliceIsSorted(l, func(i, j int) bool {
return l[i] < l[j]
})
if !ok {
t.Fatalf("postings list %v is not sorted", l)
}
}
}
}
func TestIntersect(t *testing.T) {
a := newListPostings(1, 2, 3)
b := newListPostings(2, 3, 4)
var cases = []struct {
in []Postings
res Postings
}{
{
in: []Postings{},
res: EmptyPostings(),
},
{
in: []Postings{a, b, EmptyPostings()},
res: EmptyPostings(),
},
{
in: []Postings{b, a, EmptyPostings()},
res: EmptyPostings(),
},
{
in: []Postings{EmptyPostings(), b, a},
res: EmptyPostings(),
},
{
in: []Postings{EmptyPostings(), a, b},
res: EmptyPostings(),
},
{
in: []Postings{a, EmptyPostings(), b},
res: EmptyPostings(),
},
{
in: []Postings{b, EmptyPostings(), a},
res: EmptyPostings(),
},
{
in: []Postings{b, EmptyPostings(), a, a, b, a, a, a},
res: EmptyPostings(),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 5),
newListPostings(6, 7, 8, 9, 10),
},
res: newListPostings(),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 5),
newListPostings(4, 5, 6, 7, 8),
},
res: newListPostings(4, 5),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 9, 10),
newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
},
res: newListPostings(1, 4, 10),
},
{
in: []Postings{
newListPostings(1),
newListPostings(0, 1),
},
res: newListPostings(1),
},
{
in: []Postings{
newListPostings(1),
},
res: newListPostings(1),
},
{
in: []Postings{
newListPostings(1),
newListPostings(),
},
res: newListPostings(),
},
{
in: []Postings{
newListPostings(),
newListPostings(),
},
res: newListPostings(),
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
if c.res == nil {
t.Fatal("intersect result expectancy cannot be nil")
}
expected, err := ExpandPostings(c.res)
testutil.Ok(t, err)
i := Intersect(c.in...)
if c.res == EmptyPostings() {
testutil.Equals(t, EmptyPostings(), i)
return
}
if i == EmptyPostings() {
t.Fatal("intersect unexpected result: EmptyPostings sentinel")
}
res, err := ExpandPostings(i)
testutil.Ok(t, err)
testutil.Equals(t, expected, res)
})
}
}
func TestMultiIntersect(t *testing.T) {
var cases = []struct {
p [][]uint64
res []uint64
}{
{
p: [][]uint64{
{1, 2, 3, 4, 5, 6, 1000, 1001},
{2, 4, 5, 6, 7, 8, 999, 1001},
{1, 2, 5, 6, 7, 8, 1001, 1200},
},
res: []uint64{2, 5, 6, 1001},
},
// One of the reproduceable cases for:
// https://github.com/prometheus/prometheus/issues/2616
// The initialisation of intersectPostings was moving the iterator forward
// prematurely making us miss some postings.
{
p: [][]uint64{
{1, 2},
{1, 2},
{1, 2},
{2},
},
res: []uint64{2},
},
}
for _, c := range cases {
ps := make([]Postings, 0, len(c.p))
for _, postings := range c.p {
ps = append(ps, newListPostings(postings...))
}
res, err := ExpandPostings(Intersect(ps...))
testutil.Ok(t, err)
testutil.Equals(t, c.res, res)
}
}
func BenchmarkIntersect(t *testing.B) {
t.Run("LongPostings1", func(bench *testing.B) {
var a, b, c, d []uint64
for i := 0; i < 10000000; i += 2 {
a = append(a, uint64(i))
}
for i := 5000000; i < 5000100; i += 4 {
b = append(b, uint64(i))
}
for i := 5090000; i < 5090600; i += 4 {
b = append(b, uint64(i))
}
for i := 4990000; i < 5100000; i++ {
c = append(c, uint64(i))
}
for i := 4000000; i < 6000000; i++ {
d = append(d, uint64(i))
}
i1 := newListPostings(a...)
i2 := newListPostings(b...)
i3 := newListPostings(c...)
i4 := newListPostings(d...)
bench.ResetTimer()
bench.ReportAllocs()
for i := 0; i < bench.N; i++ {
if _, err := ExpandPostings(Intersect(i1, i2, i3, i4)); err != nil {
bench.Fatal(err)
}
}
})
t.Run("LongPostings2", func(bench *testing.B) {
var a, b, c, d []uint64
for i := 0; i < 12500000; i++ {
a = append(a, uint64(i))
}
for i := 7500000; i < 12500000; i++ {
b = append(b, uint64(i))
}
for i := 9000000; i < 20000000; i++ {
c = append(c, uint64(i))
}
for i := 10000000; i < 12000000; i++ {
d = append(d, uint64(i))
}
i1 := newListPostings(a...)
i2 := newListPostings(b...)
i3 := newListPostings(c...)
i4 := newListPostings(d...)
bench.ResetTimer()
bench.ReportAllocs()
for i := 0; i < bench.N; i++ {
if _, err := ExpandPostings(Intersect(i1, i2, i3, i4)); err != nil {
bench.Fatal(err)
}
}
})
// Many matchers(k >> n).
t.Run("ManyPostings", func(bench *testing.B) {
var its []Postings
// 100000 matchers(k=100000).
for i := 0; i < 100000; i++ {
var temp []uint64
for j := 1; j < 100; j++ {
temp = append(temp, uint64(j))
}
its = append(its, newListPostings(temp...))
}
bench.ResetTimer()
bench.ReportAllocs()
for i := 0; i < bench.N; i++ {
if _, err := ExpandPostings(Intersect(its...)); err != nil {
bench.Fatal(err)
}
}
})
}
func TestMultiMerge(t *testing.T) {
i1 := newListPostings(1, 2, 3, 4, 5, 6, 1000, 1001)
i2 := newListPostings(2, 4, 5, 6, 7, 8, 999, 1001)
i3 := newListPostings(1, 2, 5, 6, 7, 8, 1001, 1200)
res, err := ExpandPostings(Merge(i1, i2, i3))
testutil.Ok(t, err)
testutil.Equals(t, []uint64{1, 2, 3, 4, 5, 6, 7, 8, 999, 1000, 1001, 1200}, res)
}
func TestMergedPostings(t *testing.T) {
var cases = []struct {
in []Postings
res Postings
}{
{
in: []Postings{},
res: EmptyPostings(),
},
{
in: []Postings{
newListPostings(),
newListPostings(),
},
res: EmptyPostings(),
},
{
in: []Postings{
newListPostings(),
},
res: newListPostings(),
},
{
in: []Postings{
EmptyPostings(),
EmptyPostings(),
EmptyPostings(),
EmptyPostings(),
},
res: EmptyPostings(),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 5),
newListPostings(6, 7, 8, 9, 10),
},
res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 5),
newListPostings(4, 5, 6, 7, 8),
},
res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 9, 10),
newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
},
res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
{
in: []Postings{
newListPostings(1, 2, 3, 4, 9, 10),
EmptyPostings(),
newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
},
res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
{
in: []Postings{
newListPostings(1, 2),
newListPostings(),
},
res: newListPostings(1, 2),
},
{
in: []Postings{
newListPostings(1, 2),
EmptyPostings(),
},
res: newListPostings(1, 2),
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
if c.res == nil {
t.Fatal("merge result expectancy cannot be nil")
}
expected, err := ExpandPostings(c.res)
testutil.Ok(t, err)
m := Merge(c.in...)
if c.res == EmptyPostings() {
testutil.Equals(t, EmptyPostings(), m)
return
}
if m == EmptyPostings() {
t.Fatal("merge unexpected result: EmptyPostings sentinel")
}
res, err := ExpandPostings(m)
testutil.Ok(t, err)
testutil.Equals(t, expected, res)
})
}
}
func TestMergedPostingsSeek(t *testing.T) {
var cases = []struct {
a, b []uint64
seek uint64
success bool
res []uint64
}{
{
a: []uint64{2, 3, 4, 5},
b: []uint64{6, 7, 8, 9, 10},
seek: 1,
success: true,
res: []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10},
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{6, 7, 8, 9, 10},
seek: 2,
success: true,
res: []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10},
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{4, 5, 6, 7, 8},
seek: 9,
success: false,
res: nil,
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 10, 11},
seek: 10,
success: true,
res: []uint64{10, 11},
},
}
for _, c := range cases {
a := newListPostings(c.a...)
b := newListPostings(c.b...)
p := Merge(a, b)
testutil.Equals(t, c.success, p.Seek(c.seek))
// After Seek(), At() should be called.
if c.success {
start := p.At()
lst, err := ExpandPostings(p)
testutil.Ok(t, err)
lst = append([]uint64{start}, lst...)
testutil.Equals(t, c.res, lst)
}
}
}
func TestRemovedPostings(t *testing.T) {
var cases = []struct {
a, b []uint64
res []uint64
}{
{
a: nil,
b: nil,
res: []uint64(nil),
},
{
a: []uint64{1, 2, 3, 4},
b: nil,
res: []uint64{1, 2, 3, 4},
},
{
a: nil,
b: []uint64{1, 2, 3, 4},
res: []uint64(nil),
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{6, 7, 8, 9, 10},
res: []uint64{1, 2, 3, 4, 5},
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{4, 5, 6, 7, 8},
res: []uint64{1, 2, 3},
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 10, 11},
res: []uint64{2, 3, 9},
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
res: []uint64(nil),
},
}
for _, c := range cases {
a := newListPostings(c.a...)
b := newListPostings(c.b...)
res, err := ExpandPostings(newRemovedPostings(a, b))
testutil.Ok(t, err)
testutil.Equals(t, c.res, res)
}
}
func TestRemovedNextStackoverflow(t *testing.T) {
var full []uint64
var remove []uint64
var i uint64
for i = 0; i < 1e7; i++ {
full = append(full, i)
remove = append(remove, i)
}
flp := newListPostings(full...)
rlp := newListPostings(remove...)
rp := newRemovedPostings(flp, rlp)
gotElem := false
for rp.Next() {
gotElem = true
}
testutil.Ok(t, rp.Err())
testutil.Assert(t, !gotElem, "")
}
func TestRemovedPostingsSeek(t *testing.T) {
var cases = []struct {
a, b []uint64
seek uint64
success bool
res []uint64
}{
{
a: []uint64{2, 3, 4, 5},
b: []uint64{6, 7, 8, 9, 10},
seek: 1,
success: true,
res: []uint64{2, 3, 4, 5},
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{6, 7, 8, 9, 10},
seek: 2,
success: true,
res: []uint64{2, 3, 4, 5},
},
{
a: []uint64{1, 2, 3, 4, 5},
b: []uint64{4, 5, 6, 7, 8},
seek: 9,
success: false,
res: nil,
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 10, 11},
seek: 10,
success: false,
res: nil,
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 11},
seek: 4,
success: true,
res: []uint64{9, 10},
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 11},
seek: 5,
success: true,
res: []uint64{9, 10},
},
{
a: []uint64{1, 2, 3, 4, 9, 10},
b: []uint64{1, 4, 5, 6, 7, 8, 11},
seek: 10,
success: true,
res: []uint64{10},
},
}
for _, c := range cases {
a := newListPostings(c.a...)
b := newListPostings(c.b...)
p := newRemovedPostings(a, b)
testutil.Equals(t, c.success, p.Seek(c.seek))
// After Seek(), At() should be called.
if c.success {
start := p.At()
lst, err := ExpandPostings(p)
testutil.Ok(t, err)
lst = append([]uint64{start}, lst...)
testutil.Equals(t, c.res, lst)
}
}
}
func TestBigEndian(t *testing.T) {
num := 1000
// mock a list as postings
ls := make([]uint32, num)
ls[0] = 2
for i := 1; i < num; i++ {
ls[i] = ls[i-1] + uint32(rand.Int31n(25)) + 2
}
beLst := make([]byte, num*4)
for i := 0; i < num; i++ {
b := beLst[i*4 : i*4+4]
binary.BigEndian.PutUint32(b, ls[i])
}
t.Run("Iteration", func(t *testing.T) {
bep := newBigEndianPostings(beLst)
for i := 0; i < num; i++ {
testutil.Assert(t, bep.Next() == true, "")
testutil.Equals(t, uint64(ls[i]), bep.At())
}
testutil.Assert(t, bep.Next() == false, "")
testutil.Assert(t, bep.Err() == nil, "")
})
t.Run("Seek", func(t *testing.T) {
table := []struct {
seek uint32
val uint32
found bool
}{
{
ls[0] - 1, ls[0], true,
},
{
ls[4], ls[4], true,
},
{
ls[500] - 1, ls[500], true,
},
{
ls[600] + 1, ls[601], true,
},
{
ls[600] + 1, ls[601], true,
},
{
ls[600] + 1, ls[601], true,
},
{
ls[0], ls[601], true,
},
{
ls[600], ls[601], true,
},
{
ls[999], ls[999], true,
},
{
ls[999] + 10, ls[999], false,
},
}
bep := newBigEndianPostings(beLst)
for _, v := range table {
testutil.Equals(t, v.found, bep.Seek(uint64(v.seek)))
testutil.Equals(t, uint64(v.val), bep.At())
testutil.Assert(t, bep.Err() == nil, "")
}
})
}
func TestIntersectWithMerge(t *testing.T) {
// One of the reproducible cases for:
// https://github.com/prometheus/prometheus/issues/2616
a := newListPostings(21, 22, 23, 24, 25, 30)
b := Merge(
newListPostings(10, 20, 30),
newListPostings(15, 26, 30),
)
p := Intersect(a, b)
res, err := ExpandPostings(p)
testutil.Ok(t, err)
testutil.Equals(t, []uint64{30}, res)
}
func TestWithoutPostings(t *testing.T) {
var cases = []struct {
base Postings
drop Postings
res Postings
}{
{
base: EmptyPostings(),
drop: EmptyPostings(),
res: EmptyPostings(),
},
{
base: EmptyPostings(),
drop: newListPostings(1, 2),
res: EmptyPostings(),
},
{
base: newListPostings(1, 2),
drop: EmptyPostings(),
res: newListPostings(1, 2),
},
{
base: newListPostings(),
drop: newListPostings(),
res: newListPostings(),
},
{
base: newListPostings(1, 2, 3),
drop: newListPostings(),
res: newListPostings(1, 2, 3),
},
{
base: newListPostings(1, 2, 3),
drop: newListPostings(4, 5, 6),
res: newListPostings(1, 2, 3),
},
{
base: newListPostings(1, 2, 3),
drop: newListPostings(3, 4, 5),
res: newListPostings(1, 2),
},
}
for _, c := range cases {
t.Run("", func(t *testing.T) {
if c.res == nil {
t.Fatal("without result expectancy cannot be nil")
}
expected, err := ExpandPostings(c.res)
testutil.Ok(t, err)
w := Without(c.base, c.drop)
if c.res == EmptyPostings() {
testutil.Equals(t, EmptyPostings(), w)
return
}
if w == EmptyPostings() {
t.Fatal("without unexpected result: EmptyPostings sentinel")
}
res, err := ExpandPostings(w)
testutil.Ok(t, err)
testutil.Equals(t, expected, res)
})
}
}

233
tsdb/labels/labels.go Normal file
View file

@ -0,0 +1,233 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"bufio"
"bytes"
"os"
"sort"
"strconv"
"strings"
"github.com/cespare/xxhash"
"github.com/pkg/errors"
)
const sep = '\xff'
// Label is a key/value pair of strings.
type Label struct {
Name, Value string
}
// Labels is a sorted set of labels. Order has to be guaranteed upon
// instantiation.
type Labels []Label
func (ls Labels) Len() int { return len(ls) }
func (ls Labels) Swap(i, j int) { ls[i], ls[j] = ls[j], ls[i] }
func (ls Labels) Less(i, j int) bool { return ls[i].Name < ls[j].Name }
func (ls Labels) String() string {
var b bytes.Buffer
b.WriteByte('{')
for i, l := range ls {
if i > 0 {
b.WriteByte(',')
}
b.WriteString(l.Name)
b.WriteByte('=')
b.WriteString(strconv.Quote(l.Value))
}
b.WriteByte('}')
return b.String()
}
// Hash returns a hash value for the label set.
func (ls Labels) Hash() uint64 {
b := make([]byte, 0, 1024)
for _, v := range ls {
b = append(b, v.Name...)
b = append(b, sep)
b = append(b, v.Value...)
b = append(b, sep)
}
return xxhash.Sum64(b)
}
// Get returns the value for the label with the given name.
// Returns an empty string if the label doesn't exist.
func (ls Labels) Get(name string) string {
for _, l := range ls {
if l.Name == name {
return l.Value
}
}
return ""
}
// Equals returns whether the two label sets are equal.
func (ls Labels) Equals(o Labels) bool {
if len(ls) != len(o) {
return false
}
for i, l := range ls {
if o[i] != l {
return false
}
}
return true
}
// Map returns a string map of the labels.
func (ls Labels) Map() map[string]string {
m := make(map[string]string, len(ls))
for _, l := range ls {
m[l.Name] = l.Value
}
return m
}
// WithoutEmpty returns the labelset without empty labels.
// May return the same labelset.
func (ls Labels) WithoutEmpty() Labels {
for _, v := range ls {
if v.Value == "" {
els := make(Labels, 0, len(ls)-1)
for _, v := range ls {
if v.Value != "" {
els = append(els, v)
}
}
return els
}
}
return ls
}
// New returns a sorted Labels from the given labels.
// The caller has to guarantee that all label names are unique.
func New(ls ...Label) Labels {
set := make(Labels, 0, len(ls))
for _, l := range ls {
set = append(set, l)
}
sort.Sort(set)
return set
}
// FromMap returns new sorted Labels from the given map.
func FromMap(m map[string]string) Labels {
l := make(Labels, 0, len(m))
for k, v := range m {
if v != "" {
l = append(l, Label{Name: k, Value: v})
}
}
sort.Sort(l)
return l
}
// FromStrings creates new labels from pairs of strings.
func FromStrings(ss ...string) Labels {
if len(ss)%2 != 0 {
panic("invalid number of strings")
}
var res Labels
for i := 0; i < len(ss); i += 2 {
if ss[i+1] != "" {
res = append(res, Label{Name: ss[i], Value: ss[i+1]})
}
}
sort.Sort(res)
return res
}
// Compare compares the two label sets.
// The result will be 0 if a==b, <0 if a < b, and >0 if a > b.
func Compare(a, b Labels) int {
l := len(a)
if len(b) < l {
l = len(b)
}
for i := 0; i < l; i++ {
if d := strings.Compare(a[i].Name, b[i].Name); d != 0 {
return d
}
if d := strings.Compare(a[i].Value, b[i].Value); d != 0 {
return d
}
}
// If all labels so far were in common, the set with fewer labels comes first.
return len(a) - len(b)
}
// Slice is a sortable slice of label sets.
type Slice []Labels
func (s Slice) Len() int { return len(s) }
func (s Slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s Slice) Less(i, j int) bool { return Compare(s[i], s[j]) < 0 }
// ReadLabels reads up to n label sets in a JSON formatted file fn. It is mostly useful
// to load testing data.
func ReadLabels(fn string, n int) ([]Labels, error) {
f, err := os.Open(fn)
if err != nil {
return nil, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
var mets []Labels
hashes := map[uint64]struct{}{}
i := 0
for scanner.Scan() && i < n {
m := make(Labels, 0, 10)
r := strings.NewReplacer("\"", "", "{", "", "}", "")
s := r.Replace(scanner.Text())
labelChunks := strings.Split(s, ",")
for _, labelChunk := range labelChunks {
split := strings.Split(labelChunk, ":")
m = append(m, Label{Name: split[0], Value: split[1]})
}
// Order of the k/v labels matters, don't assume we'll always receive them already sorted.
sort.Sort(m)
h := m.Hash()
if _, ok := hashes[h]; ok {
continue
}
mets = append(mets, m)
hashes[h] = struct{}{}
i++
}
if i != n {
return mets, errors.Errorf("requested %d metrics but found %d", n, i)
}
return mets, nil
}

199
tsdb/labels/labels_test.go Normal file
View file

@ -0,0 +1,199 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"fmt"
"math/rand"
"path/filepath"
"sort"
"testing"
"github.com/prometheus/tsdb/testutil"
)
func TestCompareAndEquals(t *testing.T) {
cases := []struct {
a, b []Label
res int
}{
{
a: []Label{},
b: []Label{},
res: 0,
},
{
a: []Label{{"a", ""}},
b: []Label{{"a", ""}, {"b", ""}},
res: -1,
},
{
a: []Label{{"a", ""}},
b: []Label{{"a", ""}},
res: 0,
},
{
a: []Label{{"aa", ""}, {"aa", ""}},
b: []Label{{"aa", ""}, {"ab", ""}},
res: -1,
},
{
a: []Label{{"aa", ""}, {"abb", ""}},
b: []Label{{"aa", ""}, {"ab", ""}},
res: 1,
},
{
a: []Label{
{"__name__", "go_gc_duration_seconds"},
{"job", "prometheus"},
{"quantile", "0.75"},
},
b: []Label{
{"__name__", "go_gc_duration_seconds"},
{"job", "prometheus"},
{"quantile", "1"},
},
res: -1,
},
{
a: []Label{
{"handler", "prometheus"},
{"instance", "localhost:9090"},
},
b: []Label{
{"handler", "query"},
{"instance", "localhost:9090"},
},
res: -1,
},
}
for _, c := range cases {
// Use constructor to ensure sortedness.
a, b := New(c.a...), New(c.b...)
testutil.Equals(t, c.res, Compare(a, b))
testutil.Equals(t, c.res == 0, a.Equals(b))
}
}
func BenchmarkSliceSort(b *testing.B) {
lbls, err := ReadLabels(filepath.Join("..", "testdata", "20kseries.json"), 20000)
testutil.Ok(b, err)
for len(lbls) < 20e6 {
lbls = append(lbls, lbls...)
}
for i := range lbls {
j := rand.Intn(i + 1)
lbls[i], lbls[j] = lbls[j], lbls[i]
}
for _, k := range []int{
100, 5000, 50000, 300000, 900000, 5e6, 20e6,
} {
b.Run(fmt.Sprintf("%d", k), func(b *testing.B) {
b.ReportAllocs()
for a := 0; a < b.N; a++ {
b.StopTimer()
cl := make(Slice, k)
copy(cl, Slice(lbls[:k]))
b.StartTimer()
sort.Sort(cl)
}
})
}
}
func BenchmarkLabelSetFromMap(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
}
var ls Labels
b.ReportAllocs()
for i := 0; i < b.N; i++ {
ls = FromMap(m)
}
_ = ls
}
func BenchmarkMapFromLabels(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
}
ls := FromMap(m)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = ls.Map()
}
}
func BenchmarkLabelSetEquals(b *testing.B) {
// The vast majority of comparisons will be against a matching label set.
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
}
ls := FromMap(m)
var res bool
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
res = ls.Equals(ls)
}
_ = res
}
func BenchmarkLabelSetHash(b *testing.B) {
// The vast majority of comparisons will be against a matching label set.
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
}
ls := FromMap(m)
var res uint64
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
res += ls.Hash()
}
fmt.Println(res)
}

109
tsdb/labels/selector.go Normal file
View file

@ -0,0 +1,109 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"fmt"
"regexp"
)
// Selector holds constraints for matching against a label set.
type Selector []Matcher
// Matches returns whether the labels satisfy all matchers.
func (s Selector) Matches(labels Labels) bool {
for _, m := range s {
if v := labels.Get(m.Name()); !m.Matches(v) {
return false
}
}
return true
}
// Matcher specifies a constraint for the value of a label.
type Matcher interface {
// Name returns the label name the matcher should apply to.
Name() string
// Matches checks whether a value fulfills the constraints.
Matches(v string) bool
// String returns a human readable matcher.
String() string
}
// EqualMatcher matches on equality.
type EqualMatcher struct {
name, value string
}
// Name implements Matcher interface.
func (m EqualMatcher) Name() string { return m.name }
// Matches implements Matcher interface.
func (m EqualMatcher) Matches(v string) bool { return v == m.value }
// String implements Matcher interface.
func (m EqualMatcher) String() string { return fmt.Sprintf("%s=%q", m.name, m.value) }
// Value returns the matched value.
func (m EqualMatcher) Value() string { return m.value }
// NewEqualMatcher returns a new matcher matching an exact label value.
func NewEqualMatcher(name, value string) Matcher {
return &EqualMatcher{name: name, value: value}
}
type RegexpMatcher struct {
name string
re *regexp.Regexp
}
func (m RegexpMatcher) Name() string { return m.name }
func (m RegexpMatcher) Matches(v string) bool { return m.re.MatchString(v) }
func (m RegexpMatcher) String() string { return fmt.Sprintf("%s=~%q", m.name, m.re.String()) }
func (m RegexpMatcher) Value() string { return m.re.String() }
// NewRegexpMatcher returns a new matcher verifying that a value matches
// the regular expression pattern.
func NewRegexpMatcher(name, pattern string) (Matcher, error) {
re, err := regexp.Compile(pattern)
if err != nil {
return nil, err
}
return &RegexpMatcher{name: name, re: re}, nil
}
// NewMustRegexpMatcher returns a new matcher verifying that a value matches
// the regular expression pattern. Will panic if the pattern is not a valid
// regular expression.
func NewMustRegexpMatcher(name, pattern string) Matcher {
re, err := regexp.Compile(pattern)
if err != nil {
panic(err)
}
return &RegexpMatcher{name: name, re: re}
}
// NotMatcher inverts the matching result for a matcher.
type NotMatcher struct {
Matcher
}
func (m NotMatcher) Matches(v string) bool { return !m.Matcher.Matches(v) }
func (m NotMatcher) String() string { return fmt.Sprintf("not(%s)", m.Matcher.String()) }
// Not inverts the matcher's matching result.
func Not(m Matcher) Matcher {
return &NotMatcher{m}
}

78
tsdb/mocks_test.go Normal file
View file

@ -0,0 +1,78 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"github.com/prometheus/tsdb/chunkenc"
"github.com/prometheus/tsdb/chunks"
"github.com/prometheus/tsdb/index"
"github.com/prometheus/tsdb/labels"
)
type mockIndexWriter struct {
series []seriesSamples
}
func (mockIndexWriter) AddSymbols(sym map[string]struct{}) error { return nil }
func (m *mockIndexWriter) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error {
i := -1
for j, s := range m.series {
if !labels.FromMap(s.lset).Equals(l) {
continue
}
i = j
break
}
if i == -1 {
m.series = append(m.series, seriesSamples{
lset: l.Map(),
})
i = len(m.series) - 1
}
var iter chunkenc.Iterator
for _, chk := range chunks {
samples := make([]sample, 0, chk.Chunk.NumSamples())
iter = chk.Chunk.Iterator(iter)
for iter.Next() {
s := sample{}
s.t, s.v = iter.At()
samples = append(samples, s)
}
if err := iter.Err(); err != nil {
return err
}
m.series[i].chunks = append(m.series[i].chunks, samples)
}
return nil
}
func (mockIndexWriter) WriteLabelIndex(names []string, values []string) error { return nil }
func (mockIndexWriter) WritePostings(name, value string, it index.Postings) error { return nil }
func (mockIndexWriter) Close() error { return nil }
type mockBReader struct {
ir IndexReader
cr ChunkReader
mint int64
maxt int64
}
func (r *mockBReader) Index() (IndexReader, error) { return r.ir, nil }
func (r *mockBReader) Chunks() (ChunkReader, error) { return r.cr, nil }
func (r *mockBReader) Tombstones() (TombstoneReader, error) { return newMemTombstones(), nil }
func (r *mockBReader) Meta() BlockMeta { return BlockMeta{MinTime: r.mint, MaxTime: r.maxt} }

1212
tsdb/querier.go Normal file

File diff suppressed because it is too large Load diff

2229
tsdb/querier_test.go Normal file

File diff suppressed because it is too large Load diff

208
tsdb/record.go Normal file
View file

@ -0,0 +1,208 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"math"
"sort"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/encoding"
"github.com/prometheus/tsdb/labels"
)
// RecordType represents the data type of a record.
type RecordType uint8
const (
// RecordInvalid is returned for unrecognised WAL record types.
RecordInvalid RecordType = 255
// RecordSeries is used to match WAL records of type Series.
RecordSeries RecordType = 1
// RecordSamples is used to match WAL records of type Samples.
RecordSamples RecordType = 2
// RecordTombstones is used to match WAL records of type Tombstones.
RecordTombstones RecordType = 3
)
// RecordDecoder decodes series, sample, and tombstone records.
// The zero value is ready to use.
type RecordDecoder struct {
}
// Type returns the type of the record.
// Return RecordInvalid if no valid record type is found.
func (d *RecordDecoder) Type(rec []byte) RecordType {
if len(rec) < 1 {
return RecordInvalid
}
switch t := RecordType(rec[0]); t {
case RecordSeries, RecordSamples, RecordTombstones:
return t
}
return RecordInvalid
}
// Series appends series in rec to the given slice.
func (d *RecordDecoder) Series(rec []byte, series []RefSeries) ([]RefSeries, error) {
dec := encoding.Decbuf{B: rec}
if RecordType(dec.Byte()) != RecordSeries {
return nil, errors.New("invalid record type")
}
for len(dec.B) > 0 && dec.Err() == nil {
ref := dec.Be64()
lset := make(labels.Labels, dec.Uvarint())
for i := range lset {
lset[i].Name = dec.UvarintStr()
lset[i].Value = dec.UvarintStr()
}
sort.Sort(lset)
series = append(series, RefSeries{
Ref: ref,
Labels: lset,
})
}
if dec.Err() != nil {
return nil, dec.Err()
}
if len(dec.B) > 0 {
return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return series, nil
}
// Samples appends samples in rec to the given slice.
func (d *RecordDecoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) {
dec := encoding.Decbuf{B: rec}
if RecordType(dec.Byte()) != RecordSamples {
return nil, errors.New("invalid record type")
}
if dec.Len() == 0 {
return samples, nil
}
var (
baseRef = dec.Be64()
baseTime = dec.Be64int64()
)
for len(dec.B) > 0 && dec.Err() == nil {
dref := dec.Varint64()
dtime := dec.Varint64()
val := dec.Be64()
samples = append(samples, RefSample{
Ref: uint64(int64(baseRef) + dref),
T: baseTime + dtime,
V: math.Float64frombits(val),
})
}
if dec.Err() != nil {
return nil, errors.Wrapf(dec.Err(), "decode error after %d samples", len(samples))
}
if len(dec.B) > 0 {
return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return samples, nil
}
// Tombstones appends tombstones in rec to the given slice.
func (d *RecordDecoder) Tombstones(rec []byte, tstones []Stone) ([]Stone, error) {
dec := encoding.Decbuf{B: rec}
if RecordType(dec.Byte()) != RecordTombstones {
return nil, errors.New("invalid record type")
}
for dec.Len() > 0 && dec.Err() == nil {
tstones = append(tstones, Stone{
ref: dec.Be64(),
intervals: Intervals{
{Mint: dec.Varint64(), Maxt: dec.Varint64()},
},
})
}
if dec.Err() != nil {
return nil, dec.Err()
}
if len(dec.B) > 0 {
return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return tstones, nil
}
// RecordEncoder encodes series, sample, and tombstones records.
// The zero value is ready to use.
type RecordEncoder struct {
}
// Series appends the encoded series to b and returns the resulting slice.
func (e *RecordEncoder) Series(series []RefSeries, b []byte) []byte {
buf := encoding.Encbuf{B: b}
buf.PutByte(byte(RecordSeries))
for _, s := range series {
buf.PutBE64(s.Ref)
buf.PutUvarint(len(s.Labels))
for _, l := range s.Labels {
buf.PutUvarintStr(l.Name)
buf.PutUvarintStr(l.Value)
}
}
return buf.Get()
}
// Samples appends the encoded samples to b and returns the resulting slice.
func (e *RecordEncoder) Samples(samples []RefSample, b []byte) []byte {
buf := encoding.Encbuf{B: b}
buf.PutByte(byte(RecordSamples))
if len(samples) == 0 {
return buf.Get()
}
// Store base timestamp and base reference number of first sample.
// All samples encode their timestamp and ref as delta to those.
first := samples[0]
buf.PutBE64(first.Ref)
buf.PutBE64int64(first.T)
for _, s := range samples {
buf.PutVarint64(int64(s.Ref) - int64(first.Ref))
buf.PutVarint64(s.T - first.T)
buf.PutBE64(math.Float64bits(s.V))
}
return buf.Get()
}
// Tombstones appends the encoded tombstones to b and returns the resulting slice.
func (e *RecordEncoder) Tombstones(tstones []Stone, b []byte) []byte {
buf := encoding.Encbuf{B: b}
buf.PutByte(byte(RecordTombstones))
for _, s := range tstones {
for _, iv := range s.intervals {
buf.PutBE64(s.ref)
buf.PutVarint64(iv.Mint)
buf.PutVarint64(iv.Maxt)
}
}
return buf.Get()
}

118
tsdb/record_test.go Normal file
View file

@ -0,0 +1,118 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"testing"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/encoding"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
)
func TestRecord_EncodeDecode(t *testing.T) {
var enc RecordEncoder
var dec RecordDecoder
series := []RefSeries{
{
Ref: 100,
Labels: labels.FromStrings("abc", "def", "123", "456"),
}, {
Ref: 1,
Labels: labels.FromStrings("abc", "def2", "1234", "4567"),
}, {
Ref: 435245,
Labels: labels.FromStrings("xyz", "def", "foo", "bar"),
},
}
decSeries, err := dec.Series(enc.Series(series, nil), nil)
testutil.Ok(t, err)
testutil.Equals(t, series, decSeries)
samples := []RefSample{
{Ref: 0, T: 12423423, V: 1.2345},
{Ref: 123, T: -1231, V: -123},
{Ref: 2, T: 0, V: 99999},
}
decSamples, err := dec.Samples(enc.Samples(samples, nil), nil)
testutil.Ok(t, err)
testutil.Equals(t, samples, decSamples)
// Intervals get split up into single entries. So we don't get back exactly
// what we put in.
tstones := []Stone{
{ref: 123, intervals: Intervals{
{Mint: -1000, Maxt: 1231231},
{Mint: 5000, Maxt: 0},
}},
{ref: 13, intervals: Intervals{
{Mint: -1000, Maxt: -11},
{Mint: 5000, Maxt: 1000},
}},
}
decTstones, err := dec.Tombstones(enc.Tombstones(tstones, nil), nil)
testutil.Ok(t, err)
testutil.Equals(t, []Stone{
{ref: 123, intervals: Intervals{{Mint: -1000, Maxt: 1231231}}},
{ref: 123, intervals: Intervals{{Mint: 5000, Maxt: 0}}},
{ref: 13, intervals: Intervals{{Mint: -1000, Maxt: -11}}},
{ref: 13, intervals: Intervals{{Mint: 5000, Maxt: 1000}}},
}, decTstones)
}
// TestRecord_Corruputed ensures that corrupted records return the correct error.
// Bugfix check for pull/521 and pull/523.
func TestRecord_Corruputed(t *testing.T) {
var enc RecordEncoder
var dec RecordDecoder
t.Run("Test corrupted series record", func(t *testing.T) {
series := []RefSeries{
{
Ref: 100,
Labels: labels.FromStrings("abc", "def", "123", "456"),
},
}
corrupted := enc.Series(series, nil)[:8]
_, err := dec.Series(corrupted, nil)
testutil.Equals(t, err, encoding.ErrInvalidSize)
})
t.Run("Test corrupted sample record", func(t *testing.T) {
samples := []RefSample{
{Ref: 0, T: 12423423, V: 1.2345},
}
corrupted := enc.Samples(samples, nil)[:8]
_, err := dec.Samples(corrupted, nil)
testutil.Equals(t, errors.Cause(err), encoding.ErrInvalidSize)
})
t.Run("Test corrupted tombstone record", func(t *testing.T) {
tstones := []Stone{
{ref: 123, intervals: Intervals{
{Mint: -1000, Maxt: 1231231},
{Mint: 5000, Maxt: 0},
}},
}
corrupted := enc.Tombstones(tstones, nil)[:8]
_, err := dec.Tombstones(corrupted, nil)
testutil.Equals(t, err, encoding.ErrInvalidSize)
})
}

133
tsdb/repair.go Normal file
View file

@ -0,0 +1,133 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"encoding/json"
"io"
"io/ioutil"
"os"
"path/filepath"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/pkg/errors"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/fileutil"
)
// repairBadIndexVersion repairs an issue in index and meta.json persistence introduced in
// commit 129773b41a565fde5156301e37f9a87158030443.
func repairBadIndexVersion(logger log.Logger, dir string) error {
// All blocks written by Prometheus 2.1 with a meta.json version of 2 are affected.
// We must actually set the index file version to 2 and revert the meta.json version back to 1.
dirs, err := blockDirs(dir)
if err != nil {
return errors.Wrapf(err, "list block dirs in %q", dir)
}
wrapErr := func(err error, d string) error {
return errors.Wrapf(err, "block dir: %q", d)
}
tmpFiles := make([]string, 0, len(dir))
defer func() {
for _, tmp := range tmpFiles {
if err := os.RemoveAll(tmp); err != nil {
level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
}
}
}()
for _, d := range dirs {
meta, err := readBogusMetaFile(d)
if err != nil {
return wrapErr(err, d)
}
if meta.Version == 1 {
level.Info(logger).Log(
"msg", "found healthy block",
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"ulid", meta.ULID,
)
continue
}
level.Info(logger).Log(
"msg", "fixing broken block",
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"ulid", meta.ULID,
)
repl, err := os.Create(filepath.Join(d, "index.repaired"))
if err != nil {
return wrapErr(err, d)
}
tmpFiles = append(tmpFiles, repl.Name())
broken, err := os.Open(filepath.Join(d, indexFilename))
if err != nil {
return wrapErr(err, d)
}
if _, err := io.Copy(repl, broken); err != nil {
return wrapErr(err, d)
}
var merr tsdb_errors.MultiError
// Set the 5th byte to 2 to indicate the correct file format version.
if _, err := repl.WriteAt([]byte{2}, 4); err != nil {
merr.Add(wrapErr(err, d))
merr.Add(wrapErr(repl.Close(), d))
return merr.Err()
}
if err := repl.Sync(); err != nil {
merr.Add(wrapErr(err, d))
merr.Add(wrapErr(repl.Close(), d))
return merr.Err()
}
if err := repl.Close(); err != nil {
return wrapErr(err, d)
}
if err := broken.Close(); err != nil {
return wrapErr(err, d)
}
if err := fileutil.Replace(repl.Name(), broken.Name()); err != nil {
return wrapErr(err, d)
}
// Reset version of meta.json to 1.
meta.Version = 1
if _, err := writeMetaFile(logger, d, meta); err != nil {
return wrapErr(err, d)
}
}
return nil
}
func readBogusMetaFile(dir string) (*BlockMeta, error) {
b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename))
if err != nil {
return nil, err
}
var m BlockMeta
if err := json.Unmarshal(b, &m); err != nil {
return nil, err
}
if m.Version != 1 && m.Version != 2 {
return nil, errors.Errorf("unexpected meta file version %d", m.Version)
}
return &m, nil
}

127
tsdb/repair_test.go Normal file
View file

@ -0,0 +1,127 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"os"
"path/filepath"
"testing"
"github.com/prometheus/tsdb/chunks"
"github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/index"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
)
func TestRepairBadIndexVersion(t *testing.T) {
// The broken index used in this test was written by the following script
// at a broken revision.
//
// func main() {
// w, err := index.NewWriter(indexFilename)
// if err != nil {
// panic(err)
// }
// err = w.AddSymbols(map[string]struct{}{
// "a": struct{}{},
// "b": struct{}{},
// "1": struct{}{},
// "2": struct{}{},
// })
// if err != nil {
// panic(err)
// }
// err = w.AddSeries(1, labels.FromStrings("a", "1", "b", "1"))
// if err != nil {
// panic(err)
// }
// err = w.AddSeries(2, labels.FromStrings("a", "2", "b", "1"))
// if err != nil {
// panic(err)
// }
// err = w.WritePostings("b", "1", index.NewListPostings([]uint64{1, 2}))
// if err != nil {
// panic(err)
// }
// if err := w.Close(); err != nil {
// panic(err)
// }
// }
dbDir := filepath.Join("testdata", "repair_index_version", "01BZJ9WJQPWHGNC2W4J9TA62KC")
tmpDir := filepath.Join("testdata", "repair_index_version", "copy")
tmpDbDir := filepath.Join(tmpDir, "3MCNSQ8S31EHGJYWK5E1GPJWJZ")
// Check the current db.
// In its current state, lookups should fail with the fixed code.
_, _, err := readMetaFile(dbDir)
testutil.NotOk(t, err)
// Touch chunks dir in block.
testutil.Ok(t, os.MkdirAll(filepath.Join(dbDir, "chunks"), 0777))
defer func() {
testutil.Ok(t, os.RemoveAll(filepath.Join(dbDir, "chunks")))
}()
r, err := index.NewFileReader(filepath.Join(dbDir, indexFilename))
testutil.Ok(t, err)
p, err := r.Postings("b", "1")
testutil.Ok(t, err)
for p.Next() {
t.Logf("next ID %d", p.At())
var lset labels.Labels
testutil.NotOk(t, r.Series(p.At(), &lset, nil))
}
testutil.Ok(t, p.Err())
testutil.Ok(t, r.Close())
// Create a copy DB to run test against.
if err = fileutil.CopyDirs(dbDir, tmpDbDir); err != nil {
t.Fatal(err)
}
defer func() {
testutil.Ok(t, os.RemoveAll(tmpDir))
}()
// On DB opening all blocks in the base dir should be repaired.
db, err := Open(tmpDir, nil, nil, nil)
testutil.Ok(t, err)
db.Close()
r, err = index.NewFileReader(filepath.Join(tmpDbDir, indexFilename))
testutil.Ok(t, err)
defer r.Close()
p, err = r.Postings("b", "1")
testutil.Ok(t, err)
res := []labels.Labels{}
for p.Next() {
t.Logf("next ID %d", p.At())
var lset labels.Labels
var chks []chunks.Meta
testutil.Ok(t, r.Series(p.At(), &lset, &chks))
res = append(res, lset)
}
testutil.Ok(t, p.Err())
testutil.Equals(t, []labels.Labels{
{{"a", "1"}, {"b", "1"}},
{{"a", "2"}, {"b", "1"}},
}, res)
meta, _, err := readMetaFile(tmpDbDir)
testutil.Ok(t, err)
testutil.Assert(t, meta.Version == 1, "unexpected meta version %d", meta.Version)
}

58
tsdb/test/conv_test.go Normal file
View file

@ -0,0 +1,58 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package test
import "testing"
func BenchmarkMapConversion(b *testing.B) {
type key string
type val string
m := map[key]val{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
}
var sm map[string]string
for i := 0; i < b.N; i++ {
sm = make(map[string]string, len(m))
for k, v := range m {
sm[string(k)] = string(v)
}
}
}
func BenchmarkListIter(b *testing.B) {
var list []uint32
for i := 0; i < 1e4; i++ {
list = append(list, uint32(i))
}
b.ResetTimer()
total := uint32(0)
for j := 0; j < b.N; j++ {
sum := uint32(0)
for _, k := range list {
sum += k
}
total += sum
}
}

124
tsdb/test/hash_test.go Normal file
View file

@ -0,0 +1,124 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package test
import (
"crypto/rand"
"fmt"
"hash/crc32"
"testing"
"github.com/cespare/xxhash"
sip13 "github.com/dgryski/go-sip13"
)
type pair struct {
name, value string
}
var testInput = []pair{
{"job", "node"},
{"instance", "123.123.1.211:9090"},
{"path", "/api/v1/namespaces/<namespace>/deployments/<name>"},
{"method", "GET"},
{"namespace", "system"},
{"status", "500"},
}
func BenchmarkHash(b *testing.B) {
input := []byte{}
for _, v := range testInput {
input = append(input, v.name...)
input = append(input, '\xff')
input = append(input, v.value...)
input = append(input, '\xff')
}
var total uint64
var k0 uint64 = 0x0706050403020100
var k1 uint64 = 0x0f0e0d0c0b0a0908
for name, f := range map[string]func(b []byte) uint64{
"xxhash": xxhash.Sum64,
"fnv64": fnv64a,
"sip13": func(b []byte) uint64 { return sip13.Sum64(k0, k1, b) },
} {
b.Run(name, func(b *testing.B) {
b.SetBytes(int64(len(input)))
total = 0
for i := 0; i < b.N; i++ {
total += f(input)
}
})
}
}
// hashAdd adds a string to a fnv64a hash value, returning the updated hash.
func fnv64a(b []byte) uint64 {
const (
offset64 = 14695981039346656037
prime64 = 1099511628211
)
h := uint64(offset64)
for x := range b {
h ^= uint64(x)
h *= prime64
}
return h
}
func BenchmarkCRC32_diff(b *testing.B) {
data := [][]byte{}
for i := 0; i < 1000; i++ {
b := make([]byte, 512)
rand.Read(b)
data = append(data, b)
}
ctab := crc32.MakeTable(crc32.Castagnoli)
total := uint32(0)
b.Run("direct", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
total += crc32.Checksum(data[i%1000], ctab)
}
})
b.Run("hash-reuse", func(b *testing.B) {
b.ReportAllocs()
h := crc32.New(ctab)
for i := 0; i < b.N; i++ {
h.Reset()
h.Write(data[i%1000])
total += h.Sum32()
}
})
b.Run("hash-new", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
h := crc32.New(ctab)
h.Write(data[i%1000])
total += h.Sum32()
}
})
fmt.Println(total)
}

216
tsdb/test/labels_test.go Normal file
View file

@ -0,0 +1,216 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package test
import (
"bytes"
"crypto/rand"
"testing"
"github.com/prometheus/tsdb/labels"
)
func BenchmarkMapClone(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
"prometheus": "prometheus-core-1",
"datacenter": "eu-west-1",
"pod_name": "abcdef-99999-defee",
}
for i := 0; i < b.N; i++ {
res := make(map[string]string, len(m))
for k, v := range m {
res[k] = v
}
m = res
}
}
func BenchmarkLabelsClone(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
"prometheus": "prometheus-core-1",
"datacenter": "eu-west-1",
"pod_name": "abcdef-99999-defee",
}
l := labels.FromMap(m)
for i := 0; i < b.N; i++ {
res := make(labels.Labels, len(l))
copy(res, l)
l = res
}
}
func BenchmarkLabelMapAccess(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
"prometheus": "prometheus-core-1",
"datacenter": "eu-west-1",
"pod_name": "abcdef-99999-defee",
}
var v string
for k := range m {
b.Run(k, func(b *testing.B) {
for i := 0; i < b.N; i++ {
v = m[k]
}
})
}
_ = v
}
func BenchmarkLabelSetAccess(b *testing.B) {
m := map[string]string{
"job": "node",
"instance": "123.123.1.211:9090",
"path": "/api/v1/namespaces/<namespace>/deployments/<name>",
"method": "GET",
"namespace": "system",
"status": "500",
"prometheus": "prometheus-core-1",
"datacenter": "eu-west-1",
"pod_name": "abcdef-99999-defee",
}
ls := labels.FromMap(m)
var v string
for _, l := range ls {
b.Run(l.Name, func(b *testing.B) {
for i := 0; i < b.N; i++ {
v = ls.Get(l.Name)
}
})
}
_ = v
}
func BenchmarkStringBytesEquals(b *testing.B) {
randBytes := func(n int) ([]byte, []byte) {
buf1 := make([]byte, n)
if _, err := rand.Read(buf1); err != nil {
b.Fatal(err)
}
buf2 := make([]byte, n)
copy(buf1, buf2)
return buf1, buf2
}
cases := []struct {
name string
f func() ([]byte, []byte)
}{
{
name: "equal",
f: func() ([]byte, []byte) {
return randBytes(60)
},
},
{
name: "1-flip-end",
f: func() ([]byte, []byte) {
b1, b2 := randBytes(60)
b2[59] ^= b2[59]
return b1, b2
},
},
{
name: "1-flip-middle",
f: func() ([]byte, []byte) {
b1, b2 := randBytes(60)
b2[29] ^= b2[29]
return b1, b2
},
},
{
name: "1-flip-start",
f: func() ([]byte, []byte) {
b1, b2 := randBytes(60)
b2[0] ^= b2[0]
return b1, b2
},
},
{
name: "different-length",
f: func() ([]byte, []byte) {
b1, b2 := randBytes(60)
return b1, b2[:59]
},
},
}
for _, c := range cases {
b.Run(c.name+"-strings", func(b *testing.B) {
ab, bb := c.f()
as, bs := string(ab), string(bb)
b.SetBytes(int64(len(as)))
var r bool
for i := 0; i < b.N; i++ {
r = as == bs
}
_ = r
})
b.Run(c.name+"-bytes", func(b *testing.B) {
ab, bb := c.f()
b.SetBytes(int64(len(ab)))
var r bool
for i := 0; i < b.N; i++ {
r = bytes.Equal(ab, bb)
}
_ = r
})
b.Run(c.name+"-bytes-length-check", func(b *testing.B) {
ab, bb := c.f()
b.SetBytes(int64(len(ab)))
var r bool
for i := 0; i < b.N; i++ {
if len(ab) != len(bb) {
continue
}
r = bytes.Equal(ab, bb)
}
_ = r
})
}
}

20000
tsdb/testdata/20kseries.json vendored Normal file

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -0,0 +1,17 @@
{
"version": 2,
"ulid": "01BZJ9WJR6Z192734YNMD62F6M",
"minTime": 1511366400000,
"maxTime": 1511368200000,
"stats": {
"numSamples": 31897565,
"numSeries": 88910,
"numChunks": 266093
},
"compaction": {
"level": 1,
"sources": [
"01BZJ9WJR6Z192734YNMD62F6M"
]
}
}

182
tsdb/testutil/directory.go Normal file
View file

@ -0,0 +1,182 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package testutil
import (
"crypto/sha256"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"testing"
)
const (
// The base directory used for test emissions, which instructs the operating
// system to use the default temporary directory as the base or TMPDIR
// environment variable.
defaultDirectory = ""
// NilCloser is a no-op Closer.
NilCloser = nilCloser(true)
// The number of times that a TemporaryDirectory will retry its removal
temporaryDirectoryRemoveRetries = 2
)
type (
// Closer is the interface that wraps the Close method.
Closer interface {
// Close reaps the underlying directory and its children. The directory
// could be deleted by its users already.
Close()
}
nilCloser bool
// TemporaryDirectory models a closeable path for transient POSIX disk
// activities.
TemporaryDirectory interface {
Closer
// Path returns the underlying path for access.
Path() string
}
// temporaryDirectory is kept as a private type due to private fields and
// their interactions.
temporaryDirectory struct {
path string
tester T
}
callbackCloser struct {
fn func()
}
// T implements the needed methods of testing.TB so that we do not need
// to actually import testing (which has the side effect of adding all
// the test flags, which we do not want in non-test binaries even if
// they make use of these utilities for some reason).
T interface {
Fatal(args ...interface{})
Fatalf(format string, args ...interface{})
}
)
func (c nilCloser) Close() {
}
func (c callbackCloser) Close() {
c.fn()
}
// NewCallbackCloser returns a Closer that calls the provided function upon
// closing.
func NewCallbackCloser(fn func()) Closer {
return &callbackCloser{
fn: fn,
}
}
func (t temporaryDirectory) Close() {
retries := temporaryDirectoryRemoveRetries
err := os.RemoveAll(t.path)
for err != nil && retries > 0 {
switch {
case os.IsNotExist(err):
err = nil
default:
retries--
err = os.RemoveAll(t.path)
}
}
if err != nil {
t.tester.Fatal(err)
}
}
func (t temporaryDirectory) Path() string {
return t.path
}
// NewTemporaryDirectory creates a new temporary directory for transient POSIX
// activities.
func NewTemporaryDirectory(name string, t T) (handler TemporaryDirectory) {
var (
directory string
err error
)
directory, err = ioutil.TempDir(defaultDirectory, name)
if err != nil {
t.Fatal(err)
}
handler = temporaryDirectory{
path: directory,
tester: t,
}
return
}
// DirSize returns the size in bytes of all files in a directory.
func DirSize(t *testing.T, path string) int64 {
var size int64
err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
Ok(t, err)
if !info.IsDir() {
size += info.Size()
}
return nil
})
Ok(t, err)
return size
}
// DirHash returns a hash of all files attribites and their content within a directory.
func DirHash(t *testing.T, path string) []byte {
hash := sha256.New()
err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
Ok(t, err)
if info.IsDir() {
return nil
}
f, err := os.Open(path)
Ok(t, err)
defer f.Close()
_, err = io.Copy(hash, f)
Ok(t, err)
_, err = io.WriteString(hash, strconv.Itoa(int(info.Size())))
Ok(t, err)
_, err = io.WriteString(hash, info.Name())
Ok(t, err)
modTime, err := info.ModTime().GobEncode()
Ok(t, err)
_, err = io.WriteString(hash, string(modTime))
Ok(t, err)
return nil
})
Ok(t, err)
return hash.Sum(nil)
}

35
tsdb/testutil/logging.go Normal file
View file

@ -0,0 +1,35 @@
// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package testutil
import (
"testing"
"github.com/go-kit/kit/log"
)
type logger struct {
t *testing.T
}
// NewLogger returns a gokit compatible Logger which calls t.Log.
func NewLogger(t *testing.T) log.Logger {
return logger{t: t}
}
// Log implements log.Logger.
func (t logger) Log(keyvals ...interface{}) error {
t.t.Log(keyvals...)
return nil
}

87
tsdb/testutil/testutil.go Normal file
View file

@ -0,0 +1,87 @@
// The MIT License (MIT)
// Copyright (c) 2014 Ben Johnson
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package testutil
import (
"fmt"
"path/filepath"
"reflect"
"runtime"
"testing"
)
// Assert fails the test if the condition is false.
func Assert(tb testing.TB, condition bool, msg string, v ...interface{}) {
if !condition {
_, file, line, _ := runtime.Caller(1)
fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", append([]interface{}{filepath.Base(file), line}, v...)...)
tb.FailNow()
}
}
// Ok fails the test if an err is not nil.
func Ok(tb testing.TB, err error) {
if err != nil {
_, file, line, _ := runtime.Caller(1)
fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", filepath.Base(file), line, err.Error())
tb.FailNow()
}
}
// NotOk fails the test if an err is nil.
func NotOk(tb testing.TB, err error) {
if err == nil {
_, file, line, _ := runtime.Caller(1)
fmt.Printf("\033[31m%s:%d: expected error, got nothing \033[39m\n\n", filepath.Base(file), line)
tb.FailNow()
}
}
// Equals fails the test if exp is not equal to act.
func Equals(tb testing.TB, exp, act interface{}, msgAndArgs ...interface{}) {
if !reflect.DeepEqual(exp, act) {
_, file, line, _ := runtime.Caller(1)
fmt.Printf("\033[31m%s:%d:%s\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(file), line, formatMessage(msgAndArgs), exp, act)
tb.FailNow()
}
}
// NotEquals fails the test if exp is equal to act.
func NotEquals(tb testing.TB, exp, act interface{}) {
if reflect.DeepEqual(exp, act) {
_, file, line, _ := runtime.Caller(1)
fmt.Printf("\033[31m%s:%d: Expected different exp and got\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(file), line, exp, act)
tb.FailNow()
}
}
func formatMessage(msgAndArgs []interface{}) string {
if len(msgAndArgs) == 0 {
return ""
}
if msg, ok := msgAndArgs[0].(string); ok {
return fmt.Sprintf("\n\nmsg: "+msg, msgAndArgs[1:]...)
}
return ""
}

304
tsdb/tombstones.go Normal file
View file

@ -0,0 +1,304 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"encoding/binary"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/tsdb/encoding"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/fileutil"
)
const tombstoneFilename = "tombstones"
const (
// MagicTombstone is 4 bytes at the head of a tombstone file.
MagicTombstone = 0x0130BA30
tombstoneFormatV1 = 1
)
// TombstoneReader gives access to tombstone intervals by series reference.
type TombstoneReader interface {
// Get returns deletion intervals for the series with the given reference.
Get(ref uint64) (Intervals, error)
// Iter calls the given function for each encountered interval.
Iter(func(uint64, Intervals) error) error
// Total returns the total count of tombstones.
Total() uint64
// Close any underlying resources
Close() error
}
func writeTombstoneFile(logger log.Logger, dir string, tr TombstoneReader) (int64, error) {
path := filepath.Join(dir, tombstoneFilename)
tmp := path + ".tmp"
hash := newCRC32()
var size int
f, err := os.Create(tmp)
if err != nil {
return 0, err
}
defer func() {
if f != nil {
if err := f.Close(); err != nil {
level.Error(logger).Log("msg", "close tmp file", "err", err.Error())
}
}
if err := os.RemoveAll(tmp); err != nil {
level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
}
}()
buf := encoding.Encbuf{B: make([]byte, 3*binary.MaxVarintLen64)}
buf.Reset()
// Write the meta.
buf.PutBE32(MagicTombstone)
buf.PutByte(tombstoneFormatV1)
n, err := f.Write(buf.Get())
if err != nil {
return 0, err
}
size += n
mw := io.MultiWriter(f, hash)
if err := tr.Iter(func(ref uint64, ivs Intervals) error {
for _, iv := range ivs {
buf.Reset()
buf.PutUvarint64(ref)
buf.PutVarint64(iv.Mint)
buf.PutVarint64(iv.Maxt)
n, err = mw.Write(buf.Get())
if err != nil {
return err
}
size += n
}
return nil
}); err != nil {
return 0, fmt.Errorf("error writing tombstones: %v", err)
}
n, err = f.Write(hash.Sum(nil))
if err != nil {
return 0, err
}
size += n
var merr tsdb_errors.MultiError
if merr.Add(f.Sync()); merr.Err() != nil {
merr.Add(f.Close())
return 0, merr.Err()
}
if err = f.Close(); err != nil {
return 0, err
}
f = nil
return int64(size), fileutil.Replace(tmp, path)
}
// Stone holds the information on the posting and time-range
// that is deleted.
type Stone struct {
ref uint64
intervals Intervals
}
func readTombstones(dir string) (TombstoneReader, int64, error) {
b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename))
if os.IsNotExist(err) {
return newMemTombstones(), 0, nil
} else if err != nil {
return nil, 0, err
}
if len(b) < 5 {
return nil, 0, errors.Wrap(encoding.ErrInvalidSize, "tombstones header")
}
d := &encoding.Decbuf{B: b[:len(b)-4]} // 4 for the checksum.
if mg := d.Be32(); mg != MagicTombstone {
return nil, 0, fmt.Errorf("invalid magic number %x", mg)
}
if flag := d.Byte(); flag != tombstoneFormatV1 {
return nil, 0, fmt.Errorf("invalid tombstone format %x", flag)
}
if d.Err() != nil {
return nil, 0, d.Err()
}
// Verify checksum.
hash := newCRC32()
if _, err := hash.Write(d.Get()); err != nil {
return nil, 0, errors.Wrap(err, "write to hash")
}
if binary.BigEndian.Uint32(b[len(b)-4:]) != hash.Sum32() {
return nil, 0, errors.New("checksum did not match")
}
stonesMap := newMemTombstones()
for d.Len() > 0 {
k := d.Uvarint64()
mint := d.Varint64()
maxt := d.Varint64()
if d.Err() != nil {
return nil, 0, d.Err()
}
stonesMap.addInterval(k, Interval{mint, maxt})
}
return stonesMap, int64(len(b)), nil
}
type memTombstones struct {
intvlGroups map[uint64]Intervals
mtx sync.RWMutex
}
// newMemTombstones creates new in memory TombstoneReader
// that allows adding new intervals.
func newMemTombstones() *memTombstones {
return &memTombstones{intvlGroups: make(map[uint64]Intervals)}
}
func (t *memTombstones) Get(ref uint64) (Intervals, error) {
t.mtx.RLock()
defer t.mtx.RUnlock()
return t.intvlGroups[ref], nil
}
func (t *memTombstones) Iter(f func(uint64, Intervals) error) error {
t.mtx.RLock()
defer t.mtx.RUnlock()
for ref, ivs := range t.intvlGroups {
if err := f(ref, ivs); err != nil {
return err
}
}
return nil
}
func (t *memTombstones) Total() uint64 {
t.mtx.RLock()
defer t.mtx.RUnlock()
total := uint64(0)
for _, ivs := range t.intvlGroups {
total += uint64(len(ivs))
}
return total
}
// addInterval to an existing memTombstones
func (t *memTombstones) addInterval(ref uint64, itvs ...Interval) {
t.mtx.Lock()
defer t.mtx.Unlock()
for _, itv := range itvs {
t.intvlGroups[ref] = t.intvlGroups[ref].add(itv)
}
}
func (*memTombstones) Close() error {
return nil
}
// Interval represents a single time-interval.
type Interval struct {
Mint, Maxt int64
}
func (tr Interval) inBounds(t int64) bool {
return t >= tr.Mint && t <= tr.Maxt
}
func (tr Interval) isSubrange(dranges Intervals) bool {
for _, r := range dranges {
if r.inBounds(tr.Mint) && r.inBounds(tr.Maxt) {
return true
}
}
return false
}
// Intervals represents a set of increasing and non-overlapping time-intervals.
type Intervals []Interval
// add the new time-range to the existing ones.
// The existing ones must be sorted.
func (itvs Intervals) add(n Interval) Intervals {
for i, r := range itvs {
// TODO(gouthamve): Make this codepath easier to digest.
if r.inBounds(n.Mint-1) || r.inBounds(n.Mint) {
if n.Maxt > r.Maxt {
itvs[i].Maxt = n.Maxt
}
j := 0
for _, r2 := range itvs[i+1:] {
if n.Maxt < r2.Mint {
break
}
j++
}
if j != 0 {
if itvs[i+j].Maxt > n.Maxt {
itvs[i].Maxt = itvs[i+j].Maxt
}
itvs = append(itvs[:i+1], itvs[i+j+1:]...)
}
return itvs
}
if r.inBounds(n.Maxt+1) || r.inBounds(n.Maxt) {
if n.Mint < r.Maxt {
itvs[i].Mint = n.Mint
}
return itvs
}
if n.Mint < r.Mint {
newRange := make(Intervals, i, len(itvs[:i])+1)
copy(newRange, itvs[:i])
newRange = append(newRange, n)
newRange = append(newRange, itvs[i:]...)
return newRange
}
}
itvs = append(itvs, n)
return itvs
}

150
tsdb/tombstones_test.go Normal file
View file

@ -0,0 +1,150 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"io/ioutil"
"math/rand"
"os"
"sync"
"testing"
"time"
"github.com/go-kit/kit/log"
"github.com/prometheus/tsdb/testutil"
)
func TestWriteAndReadbackTombStones(t *testing.T) {
tmpdir, _ := ioutil.TempDir("", "test")
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
ref := uint64(0)
stones := newMemTombstones()
// Generate the tombstones.
for i := 0; i < 100; i++ {
ref += uint64(rand.Int31n(10)) + 1
numRanges := rand.Intn(5) + 1
dranges := make(Intervals, 0, numRanges)
mint := rand.Int63n(time.Now().UnixNano())
for j := 0; j < numRanges; j++ {
dranges = dranges.add(Interval{mint, mint + rand.Int63n(1000)})
mint += rand.Int63n(1000) + 1
}
stones.addInterval(ref, dranges...)
}
_, err := writeTombstoneFile(log.NewNopLogger(), tmpdir, stones)
testutil.Ok(t, err)
restr, _, err := readTombstones(tmpdir)
testutil.Ok(t, err)
// Compare the two readers.
testutil.Equals(t, stones, restr)
}
func TestAddingNewIntervals(t *testing.T) {
cases := []struct {
exist Intervals
new Interval
exp Intervals
}{
{
new: Interval{1, 2},
exp: Intervals{{1, 2}},
},
{
exist: Intervals{{1, 2}},
new: Interval{1, 2},
exp: Intervals{{1, 2}},
},
{
exist: Intervals{{1, 4}, {6, 6}},
new: Interval{5, 6},
exp: Intervals{{1, 6}},
},
{
exist: Intervals{{1, 10}, {12, 20}, {25, 30}},
new: Interval{21, 23},
exp: Intervals{{1, 10}, {12, 23}, {25, 30}},
},
{
exist: Intervals{{1, 2}, {3, 5}, {7, 7}},
new: Interval{6, 7},
exp: Intervals{{1, 2}, {3, 7}},
},
{
exist: Intervals{{1, 10}, {12, 20}, {25, 30}},
new: Interval{21, 25},
exp: Intervals{{1, 10}, {12, 30}},
},
{
exist: Intervals{{1, 10}, {12, 20}, {25, 30}},
new: Interval{18, 23},
exp: Intervals{{1, 10}, {12, 23}, {25, 30}},
},
{
exist: Intervals{{1, 10}, {12, 20}, {25, 30}},
new: Interval{9, 23},
exp: Intervals{{1, 23}, {25, 30}},
},
{
exist: Intervals{{1, 10}, {12, 20}, {25, 30}},
new: Interval{9, 230},
exp: Intervals{{1, 230}},
},
{
exist: Intervals{{5, 10}, {12, 20}, {25, 30}},
new: Interval{1, 4},
exp: Intervals{{1, 10}, {12, 20}, {25, 30}},
},
{
exist: Intervals{{5, 10}, {12, 20}, {25, 30}},
new: Interval{11, 14},
exp: Intervals{{5, 20}, {25, 30}},
},
}
for _, c := range cases {
testutil.Equals(t, c.exp, c.exist.add(c.new))
}
}
// TestMemTombstonesConcurrency to make sure they are safe to access from different goroutines.
func TestMemTombstonesConcurrency(t *testing.T) {
tomb := newMemTombstones()
totalRuns := 100
var wg sync.WaitGroup
wg.Add(2)
go func() {
for x := 0; x < totalRuns; x++ {
tomb.addInterval(uint64(x), Interval{int64(x), int64(x)})
}
wg.Done()
}()
go func() {
for x := 0; x < totalRuns; x++ {
_, err := tomb.Get(uint64(x))
testutil.Ok(t, err)
}
wg.Done()
}()
wg.Wait()
}

236
tsdb/tsdbutil/buffer.go Normal file
View file

@ -0,0 +1,236 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdbutil
import (
"math"
)
// SeriesIterator iterates over the data of a time series.
type SeriesIterator interface {
// Seek advances the iterator forward to the given timestamp.
// If there's no value exactly at t, it advances to the first value
// after t.
Seek(t int64) bool
// At returns the current timestamp/value pair.
At() (t int64, v float64)
// Next advances the iterator by one.
Next() bool
// Err returns the current error.
Err() error
}
// BufferedSeriesIterator wraps an iterator with a look-back buffer.
type BufferedSeriesIterator struct {
it SeriesIterator
buf *sampleRing
lastTime int64
}
// NewBuffer returns a new iterator that buffers the values within the time range
// of the current element and the duration of delta before.
func NewBuffer(it SeriesIterator, delta int64) *BufferedSeriesIterator {
return &BufferedSeriesIterator{
it: it,
buf: newSampleRing(delta, 16),
lastTime: math.MinInt64,
}
}
// PeekBack returns the previous element of the iterator. If there is none buffered,
// ok is false.
func (b *BufferedSeriesIterator) PeekBack() (t int64, v float64, ok bool) {
return b.buf.last()
}
// Buffer returns an iterator over the buffered data.
func (b *BufferedSeriesIterator) Buffer() SeriesIterator {
return b.buf.iterator()
}
// Seek advances the iterator to the element at time t or greater.
func (b *BufferedSeriesIterator) Seek(t int64) bool {
t0 := t - b.buf.delta
// If the delta would cause us to seek backwards, preserve the buffer
// and just continue regular advancement while filling the buffer on the way.
if t0 > b.lastTime {
b.buf.reset()
ok := b.it.Seek(t0)
if !ok {
return false
}
b.lastTime, _ = b.At()
}
if b.lastTime >= t {
return true
}
for b.Next() {
if b.lastTime >= t {
return true
}
}
return false
}
// Next advances the iterator to the next element.
func (b *BufferedSeriesIterator) Next() bool {
// Add current element to buffer before advancing.
b.buf.add(b.it.At())
ok := b.it.Next()
if ok {
b.lastTime, _ = b.At()
}
return ok
}
// At returns the current element of the iterator.
func (b *BufferedSeriesIterator) At() (int64, float64) {
return b.it.At()
}
// Err returns the last encountered error.
func (b *BufferedSeriesIterator) Err() error {
return b.it.Err()
}
type sample struct {
t int64
v float64
}
func (s sample) T() int64 {
return s.t
}
func (s sample) V() float64 {
return s.v
}
type sampleRing struct {
delta int64
buf []sample // lookback buffer
i int // position of most recent element in ring buffer
f int // position of first element in ring buffer
l int // number of elements in buffer
}
func newSampleRing(delta int64, sz int) *sampleRing {
r := &sampleRing{delta: delta, buf: make([]sample, sz)}
r.reset()
return r
}
func (r *sampleRing) reset() {
r.l = 0
r.i = -1
r.f = 0
}
func (r *sampleRing) iterator() SeriesIterator {
return &sampleRingIterator{r: r, i: -1}
}
type sampleRingIterator struct {
r *sampleRing
i int
}
func (it *sampleRingIterator) Next() bool {
it.i++
return it.i < it.r.l
}
func (it *sampleRingIterator) Seek(int64) bool {
return false
}
func (it *sampleRingIterator) Err() error {
return nil
}
func (it *sampleRingIterator) At() (int64, float64) {
return it.r.at(it.i)
}
func (r *sampleRing) at(i int) (int64, float64) {
j := (r.f + i) % len(r.buf)
s := r.buf[j]
return s.t, s.v
}
// add adds a sample to the ring buffer and frees all samples that fall
// out of the delta range.
func (r *sampleRing) add(t int64, v float64) {
l := len(r.buf)
// Grow the ring buffer if it fits no more elements.
if l == r.l {
buf := make([]sample, 2*l)
copy(buf[l+r.f:], r.buf[r.f:])
copy(buf, r.buf[:r.f])
r.buf = buf
r.i = r.f
r.f += l
} else {
r.i++
if r.i >= l {
r.i -= l
}
}
r.buf[r.i] = sample{t: t, v: v}
r.l++
// Free head of the buffer of samples that just fell out of the range.
for r.buf[r.f].t < t-r.delta {
r.f++
if r.f >= l {
r.f -= l
}
r.l--
}
}
// last returns the most recent element added to the ring.
func (r *sampleRing) last() (int64, float64, bool) {
if r.l == 0 {
return 0, 0, false
}
s := r.buf[r.i]
return s.t, s.v, true
}
func (r *sampleRing) samples() []sample {
res := make([]sample, r.l)
var k = r.f + r.l
var j int
if k > len(r.buf) {
k = len(r.buf)
j = r.l - k + r.f
}
n := copy(res, r.buf[r.f:k])
copy(res[n:], r.buf[:j])
return res
}

View file

@ -0,0 +1,173 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdbutil
import (
"math/rand"
"sort"
"testing"
"github.com/prometheus/tsdb/testutil"
)
func TestSampleRing(t *testing.T) {
cases := []struct {
input []int64
delta int64
size int
}{
{
input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
delta: 2,
size: 1,
},
{
input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
delta: 2,
size: 2,
},
{
input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
delta: 7,
size: 3,
},
{
input: []int64{1, 2, 3, 4, 5, 16, 17, 18, 19, 20},
delta: 7,
size: 1,
},
}
for _, c := range cases {
r := newSampleRing(c.delta, c.size)
input := []sample{}
for _, t := range c.input {
input = append(input, sample{
t: t,
v: float64(rand.Intn(100)),
})
}
for i, s := range input {
r.add(s.t, s.v)
buffered := r.samples()
for _, sold := range input[:i] {
found := false
for _, bs := range buffered {
if bs.t == sold.t && bs.v == sold.v {
found = true
break
}
}
if sold.t >= s.t-c.delta && !found {
t.Fatalf("%d: expected sample %d to be in buffer but was not; buffer %v", i, sold.t, buffered)
}
if sold.t < s.t-c.delta && found {
t.Fatalf("%d: unexpected sample %d in buffer; buffer %v", i, sold.t, buffered)
}
}
}
}
}
func TestBufferedSeriesIterator(t *testing.T) {
var it *BufferedSeriesIterator
bufferEq := func(exp []sample) {
var b []sample
bit := it.Buffer()
for bit.Next() {
t, v := bit.At()
b = append(b, sample{t: t, v: v})
}
testutil.Equals(t, exp, b)
}
sampleEq := func(ets int64, ev float64) {
ts, v := it.At()
testutil.Equals(t, ets, ts)
testutil.Equals(t, ev, v)
}
it = NewBuffer(newListSeriesIterator([]sample{
{t: 1, v: 2},
{t: 2, v: 3},
{t: 3, v: 4},
{t: 4, v: 5},
{t: 5, v: 6},
{t: 99, v: 8},
{t: 100, v: 9},
{t: 101, v: 10},
}), 2)
testutil.Assert(t, it.Seek(-123) == true, "seek failed")
sampleEq(1, 2)
bufferEq(nil)
testutil.Assert(t, it.Next() == true, "next failed")
sampleEq(2, 3)
bufferEq([]sample{{t: 1, v: 2}})
testutil.Assert(t, it.Next() == true, "next failed")
testutil.Assert(t, it.Next() == true, "next failed")
testutil.Assert(t, it.Next() == true, "next failed")
sampleEq(5, 6)
bufferEq([]sample{{t: 2, v: 3}, {t: 3, v: 4}, {t: 4, v: 5}})
testutil.Assert(t, it.Seek(5) == true, "seek failed")
sampleEq(5, 6)
bufferEq([]sample{{t: 2, v: 3}, {t: 3, v: 4}, {t: 4, v: 5}})
testutil.Assert(t, it.Seek(101) == true, "seek failed")
sampleEq(101, 10)
bufferEq([]sample{{t: 99, v: 8}, {t: 100, v: 9}})
testutil.Assert(t, it.Next() == false, "next succeeded unexpectedly")
}
type listSeriesIterator struct {
list []sample
idx int
}
func newListSeriesIterator(list []sample) *listSeriesIterator {
return &listSeriesIterator{list: list, idx: -1}
}
func (it *listSeriesIterator) At() (int64, float64) {
s := it.list[it.idx]
return s.t, s.v
}
func (it *listSeriesIterator) Next() bool {
it.idx++
return it.idx < len(it.list)
}
func (it *listSeriesIterator) Seek(t int64) bool {
if it.idx == -1 {
it.idx = 0
}
// Do binary search between current position and end.
it.idx = sort.Search(len(it.list)-it.idx, func(i int) bool {
s := it.list[i+it.idx]
return s.t >= t
})
return it.idx < len(it.list)
}
func (it *listSeriesIterator) Err() error {
return nil
}

53
tsdb/tsdbutil/chunks.go Normal file
View file

@ -0,0 +1,53 @@
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdbutil
import (
"github.com/prometheus/tsdb/chunkenc"
"github.com/prometheus/tsdb/chunks"
)
type Sample interface {
T() int64
V() float64
}
func ChunkFromSamples(s []Sample) chunks.Meta {
mint, maxt := int64(0), int64(0)
if len(s) > 0 {
mint, maxt = s[0].T(), s[len(s)-1].T()
}
c := chunkenc.NewXORChunk()
ca, _ := c.Appender()
for _, s := range s {
ca.Append(s.T(), s.V())
}
return chunks.Meta{
MinTime: mint,
MaxTime: maxt,
Chunk: c,
}
}
// PopulatedChunk creates a chunk populated with samples every second starting at minTime
func PopulatedChunk(numSamples int, minTime int64) chunks.Meta {
samples := make([]Sample, numSamples)
for i := 0; i < numSamples; i++ {
samples[i] = sample{minTime + int64(i*1000), 1.0}
}
return ChunkFromSamples(samples)
}

1313
tsdb/wal.go Normal file

File diff suppressed because it is too large Load diff

322
tsdb/wal/live_reader.go Normal file
View file

@ -0,0 +1,322 @@
// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/golang/snappy"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)
// liveReaderMetrics holds all metrics exposed by the LiveReader.
type liveReaderMetrics struct {
readerCorruptionErrors *prometheus.CounterVec
}
// LiveReaderMetrics instatiates, registers and returns metrics to be injected
// at LiveReader instantiation.
func NewLiveReaderMetrics(reg prometheus.Registerer) *liveReaderMetrics {
m := &liveReaderMetrics{
readerCorruptionErrors: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_reader_corruption_errors_total",
Help: "Errors encountered when reading the WAL.",
}, []string{"error"}),
}
if reg != nil {
reg.Register(m.readerCorruptionErrors)
}
return m
}
// NewLiveReader returns a new live reader.
func NewLiveReader(logger log.Logger, metrics *liveReaderMetrics, r io.Reader) *LiveReader {
lr := &LiveReader{
logger: logger,
rdr: r,
metrics: metrics,
// Until we understand how they come about, make readers permissive
// to records spanning pages.
permissive: true,
}
return lr
}
// LiveReader reads WAL records from an io.Reader. It allows reading of WALs
// that are still in the process of being written, and returns records as soon
// as they can be read.
type LiveReader struct {
logger log.Logger
rdr io.Reader
err error
rec []byte
snappyBuf []byte
hdr [recordHeaderSize]byte
buf [pageSize]byte
readIndex int // Index in buf to start at for next read.
writeIndex int // Index in buf to start at for next write.
total int64 // Total bytes processed during reading in calls to Next().
index int // Used to track partial records, should be 0 at the start of every new record.
// For testing, we can treat EOF as a non-error.
eofNonErr bool
// We sometime see records span page boundaries. Should never happen, but it
// does. Until we track down why, set permissive to true to tolerate it.
// NB the non-ive Reader implementation allows for this.
permissive bool
metrics *liveReaderMetrics
}
// Err returns any errors encountered reading the WAL. io.EOFs are not terminal
// and Next can be tried again. Non-EOFs are terminal, and the reader should
// not be used again. It is up to the user to decide when to stop trying should
// io.EOF be returned.
func (r *LiveReader) Err() error {
if r.eofNonErr && r.err == io.EOF {
return nil
}
return r.err
}
// Offset returns the number of bytes consumed from this segment.
func (r *LiveReader) Offset() int64 {
return r.total
}
func (r *LiveReader) fillBuffer() (int, error) {
n, err := r.rdr.Read(r.buf[r.writeIndex:len(r.buf)])
r.writeIndex += n
return n, err
}
// Next returns true if Record() will contain a full record.
// If Next returns false, you should always checked the contents of Error().
// Return false guarantees there are no more records if the segment is closed
// and not corrupt, otherwise if Err() == io.EOF you should try again when more
// data has been written.
func (r *LiveReader) Next() bool {
for {
// If buildRecord returns a non-EOF error, its game up - the segment is
// corrupt. If buildRecord returns an EOF, we try and read more in
// fillBuffer later on. If that fails to read anything (n=0 && err=EOF),
// we return EOF and the user can try again later. If we have a full
// page, buildRecord is guaranteed to return a record or a non-EOF; it
// has checks the records fit in pages.
if ok, err := r.buildRecord(); ok {
return true
} else if err != nil && err != io.EOF {
r.err = err
return false
}
// If we've filled the page and not found a record, this
// means records have started to span pages. Shouldn't happen
// but does and until we found out why, we need to deal with this.
if r.permissive && r.writeIndex == pageSize && r.readIndex > 0 {
copy(r.buf[:], r.buf[r.readIndex:])
r.writeIndex -= r.readIndex
r.readIndex = 0
continue
}
if r.readIndex == pageSize {
r.writeIndex = 0
r.readIndex = 0
}
if r.writeIndex != pageSize {
n, err := r.fillBuffer()
if n == 0 || (err != nil && err != io.EOF) {
r.err = err
return false
}
}
}
}
// Record returns the current record.
// The returned byte slice is only valid until the next call to Next.
func (r *LiveReader) Record() []byte {
return r.rec
}
// Rebuild a full record from potentially partial records. Returns false
// if there was an error or if we weren't able to read a record for any reason.
// Returns true if we read a full record. Any record data is appended to
// LiveReader.rec
func (r *LiveReader) buildRecord() (bool, error) {
for {
// Check that we have data in the internal buffer to read.
if r.writeIndex <= r.readIndex {
return false, nil
}
// Attempt to read a record, partial or otherwise.
temp, n, err := r.readRecord()
if err != nil {
return false, err
}
r.readIndex += n
r.total += int64(n)
if temp == nil {
return false, nil
}
rt := recTypeFromHeader(r.hdr[0])
if rt == recFirst || rt == recFull {
r.rec = r.rec[:0]
r.snappyBuf = r.snappyBuf[:0]
}
compressed := r.hdr[0]&snappyMask != 0
if compressed {
r.snappyBuf = append(r.snappyBuf, temp...)
} else {
r.rec = append(r.rec, temp...)
}
if err := validateRecord(rt, r.index); err != nil {
r.index = 0
return false, err
}
if rt == recLast || rt == recFull {
r.index = 0
if compressed && len(r.snappyBuf) > 0 {
// The snappy library uses `len` to calculate if we need a new buffer.
// In order to allocate as few buffers as possible make the length
// equal to the capacity.
r.rec = r.rec[:cap(r.rec)]
r.rec, err = snappy.Decode(r.rec, r.snappyBuf)
if err != nil {
return false, err
}
}
return true, nil
}
// Only increment i for non-zero records since we use it
// to determine valid content record sequences.
r.index++
}
}
// Returns an error if the recType and i indicate an invalid record sequence.
// As an example, if i is > 0 because we've read some amount of a partial record
// (recFirst, recMiddle, etc. but not recLast) and then we get another recFirst or recFull
// instead of a recLast or recMiddle we would have an invalid record.
func validateRecord(typ recType, i int) error {
switch typ {
case recFull:
if i != 0 {
return errors.New("unexpected full record")
}
return nil
case recFirst:
if i != 0 {
return errors.New("unexpected first record, dropping buffer")
}
return nil
case recMiddle:
if i == 0 {
return errors.New("unexpected middle record, dropping buffer")
}
return nil
case recLast:
if i == 0 {
return errors.New("unexpected last record, dropping buffer")
}
return nil
default:
return errors.Errorf("unexpected record type %d", typ)
}
}
// Read a sub-record (see recType) from the buffer. It could potentially
// be a full record (recFull) if the record fits within the bounds of a single page.
// Returns a byte slice of the record data read, the number of bytes read, and an error
// if there's a non-zero byte in a page term record or the record checksum fails.
// This is a non-method function to make it clear it does not mutate the reader.
func (r *LiveReader) readRecord() ([]byte, int, error) {
// Special case: for recPageTerm, check that are all zeros to end of page,
// consume them but don't return them.
if r.buf[r.readIndex] == byte(recPageTerm) {
// End of page won't necessarily be end of buffer, as we may have
// got misaligned by records spanning page boundaries.
// r.total % pageSize is the offset into the current page
// that r.readIndex points to in buf. Therefore
// pageSize - (r.total % pageSize) is the amount left to read of
// the current page.
remaining := int(pageSize - (r.total % pageSize))
if r.readIndex+remaining > r.writeIndex {
return nil, 0, io.EOF
}
for i := r.readIndex; i < r.readIndex+remaining; i++ {
if r.buf[i] != 0 {
return nil, 0, errors.New("unexpected non-zero byte in page term bytes")
}
}
return nil, remaining, nil
}
// Not a recPageTerm; read the record and check the checksum.
if r.writeIndex-r.readIndex < recordHeaderSize {
return nil, 0, io.EOF
}
copy(r.hdr[:], r.buf[r.readIndex:r.readIndex+recordHeaderSize])
length := int(binary.BigEndian.Uint16(r.hdr[1:]))
crc := binary.BigEndian.Uint32(r.hdr[3:])
if r.readIndex+recordHeaderSize+length > pageSize {
if !r.permissive {
return nil, 0, fmt.Errorf("record would overflow current page: %d > %d", r.readIndex+recordHeaderSize+length, pageSize)
}
r.metrics.readerCorruptionErrors.WithLabelValues("record_span_page").Inc()
level.Warn(r.logger).Log("msg", "record spans page boundaries", "start", r.readIndex, "end", recordHeaderSize+length, "pageSize", pageSize)
}
if recordHeaderSize+length > pageSize {
return nil, 0, fmt.Errorf("record length greater than a single page: %d > %d", recordHeaderSize+length, pageSize)
}
if r.readIndex+recordHeaderSize+length > r.writeIndex {
return nil, 0, io.EOF
}
rec := r.buf[r.readIndex+recordHeaderSize : r.readIndex+recordHeaderSize+length]
if c := crc32.Checksum(rec, castagnoliTable); c != crc {
return nil, 0, errors.Errorf("unexpected checksum %x, expected %x", c, crc)
}
return rec, length + recordHeaderSize, nil
}
func min(i, j int) int {
if i < j {
return i
}
return j
}

200
tsdb/wal/reader.go Normal file
View file

@ -0,0 +1,200 @@
// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"hash/crc32"
"io"
"github.com/golang/snappy"
"github.com/pkg/errors"
)
// Reader reads WAL records from an io.Reader.
type Reader struct {
rdr io.Reader
err error
rec []byte
snappyBuf []byte
buf [pageSize]byte
total int64 // Total bytes processed.
curRecTyp recType // Used for checking that the last record is not torn.
}
// NewReader returns a new reader.
func NewReader(r io.Reader) *Reader {
return &Reader{rdr: r}
}
// Next advances the reader to the next records and returns true if it exists.
// It must not be called again after it returned false.
func (r *Reader) Next() bool {
err := r.next()
if errors.Cause(err) == io.EOF {
// The last WAL segment record shouldn't be torn(should be full or last).
// The last record would be torn after a crash just before
// the last record part could be persisted to disk.
if r.curRecTyp == recFirst || r.curRecTyp == recMiddle {
r.err = errors.New("last record is torn")
}
return false
}
r.err = err
return r.err == nil
}
func (r *Reader) next() (err error) {
// We have to use r.buf since allocating byte arrays here fails escape
// analysis and ends up on the heap, even though it seemingly should not.
hdr := r.buf[:recordHeaderSize]
buf := r.buf[recordHeaderSize:]
r.rec = r.rec[:0]
r.snappyBuf = r.snappyBuf[:0]
i := 0
for {
if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {
return errors.Wrap(err, "read first header byte")
}
r.total++
r.curRecTyp = recTypeFromHeader(hdr[0])
compressed := hdr[0]&snappyMask != 0
// Gobble up zero bytes.
if r.curRecTyp == recPageTerm {
// recPageTerm is a single byte that indicates the rest of the page is padded.
// If it's the first byte in a page, buf is too small and
// needs to be resized to fit pageSize-1 bytes.
buf = r.buf[1:]
// We are pedantic and check whether the zeros are actually up
// to a page boundary.
// It's not strictly necessary but may catch sketchy state early.
k := pageSize - (r.total % pageSize)
if k == pageSize {
continue // Initial 0 byte was last page byte.
}
n, err := io.ReadFull(r.rdr, buf[:k])
if err != nil {
return errors.Wrap(err, "read remaining zeros")
}
r.total += int64(n)
for _, c := range buf[:k] {
if c != 0 {
return errors.New("unexpected non-zero byte in padded page")
}
}
continue
}
n, err := io.ReadFull(r.rdr, hdr[1:])
if err != nil {
return errors.Wrap(err, "read remaining header")
}
r.total += int64(n)
var (
length = binary.BigEndian.Uint16(hdr[1:])
crc = binary.BigEndian.Uint32(hdr[3:])
)
if length > pageSize-recordHeaderSize {
return errors.Errorf("invalid record size %d", length)
}
n, err = io.ReadFull(r.rdr, buf[:length])
if err != nil {
return err
}
r.total += int64(n)
if n != int(length) {
return errors.Errorf("invalid size: expected %d, got %d", length, n)
}
if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
return errors.Errorf("unexpected checksum %x, expected %x", c, crc)
}
if compressed {
r.snappyBuf = append(r.snappyBuf, buf[:length]...)
} else {
r.rec = append(r.rec, buf[:length]...)
}
if err := validateRecord(r.curRecTyp, i); err != nil {
return err
}
if r.curRecTyp == recLast || r.curRecTyp == recFull {
if compressed && len(r.snappyBuf) > 0 {
// The snappy library uses `len` to calculate if we need a new buffer.
// In order to allocate as few buffers as possible make the length
// equal to the capacity.
r.rec = r.rec[:cap(r.rec)]
r.rec, err = snappy.Decode(r.rec, r.snappyBuf)
return err
}
return nil
}
// Only increment i for non-zero records since we use it
// to determine valid content record sequences.
i++
}
}
// Err returns the last encountered error wrapped in a corruption error.
// If the reader does not allow to infer a segment index and offset, a total
// offset in the reader stream will be provided.
func (r *Reader) Err() error {
if r.err == nil {
return nil
}
if b, ok := r.rdr.(*segmentBufReader); ok {
return &CorruptionErr{
Err: r.err,
Dir: b.segs[b.cur].Dir(),
Segment: b.segs[b.cur].Index(),
Offset: int64(b.off),
}
}
return &CorruptionErr{
Err: r.err,
Segment: -1,
Offset: r.total,
}
}
// Record returns the current record. The returned byte slice is only
// valid until the next call to Next.
func (r *Reader) Record() []byte {
return r.rec
}
// Segment returns the current segment being read.
func (r *Reader) Segment() int {
if b, ok := r.rdr.(*segmentBufReader); ok {
return b.segs[b.cur].Index()
}
return -1
}
// Offset returns the current position of the segment being read.
func (r *Reader) Offset() int64 {
if b, ok := r.rdr.(*segmentBufReader); ok {
return int64(b.off)
}
return r.total
}

549
tsdb/wal/reader_test.go Normal file
View file

@ -0,0 +1,549 @@
// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"runtime"
"strconv"
"testing"
"time"
"github.com/go-kit/kit/log"
tsdb_errors "github.com/prometheus/tsdb/errors"
"github.com/prometheus/tsdb/testutil"
)
type reader interface {
Next() bool
Err() error
Record() []byte
Offset() int64
}
type record struct {
t recType
b []byte
}
var readerConstructors = map[string]func(io.Reader) reader{
"Reader": func(r io.Reader) reader {
return NewReader(r)
},
"LiveReader": func(r io.Reader) reader {
lr := NewLiveReader(log.NewNopLogger(), NewLiveReaderMetrics(nil), r)
lr.eofNonErr = true
return lr
},
}
var data = make([]byte, 100000)
var testReaderCases = []struct {
t []record
exp [][]byte
fail bool
}{
// Sequence of valid records.
{
t: []record{
{recFull, data[0:200]},
{recFirst, data[200:300]},
{recLast, data[300:400]},
{recFirst, data[400:800]},
{recMiddle, data[800:900]},
{recPageTerm, make([]byte, pageSize-900-recordHeaderSize*5-1)}, // exactly lines up with page boundary.
{recLast, data[900:900]},
{recFirst, data[900:1000]},
{recMiddle, data[1000:1200]},
{recMiddle, data[1200:30000]},
{recMiddle, data[30000:30001]},
{recMiddle, data[30001:30001]},
{recLast, data[30001:32000]},
},
exp: [][]byte{
data[0:200],
data[200:400],
data[400:900],
data[900:32000],
},
},
// Exactly at the limit of one page minus the header size
{
t: []record{
{recFull, data[0 : pageSize-recordHeaderSize]},
},
exp: [][]byte{
data[:pageSize-recordHeaderSize],
},
},
// More than a full page, this exceeds our buffer and can never happen
// when written by the WAL.
{
t: []record{
{recFull, data[0 : pageSize+1]},
},
fail: true,
},
// Two records the together are too big for a page.
// NB currently the non-live reader succeeds on this. I think this is a bug.
// but we've seen it in production.
{
t: []record{
{recFull, data[:pageSize/2]},
{recFull, data[:pageSize/2]},
},
exp: [][]byte{
data[:pageSize/2],
data[:pageSize/2],
},
},
// Invalid orders of record types.
{
t: []record{{recMiddle, data[:200]}},
fail: true,
},
{
t: []record{{recLast, data[:200]}},
fail: true,
},
{
t: []record{
{recFirst, data[:200]},
{recFull, data[200:400]},
},
fail: true,
},
{
t: []record{
{recFirst, data[:100]},
{recMiddle, data[100:200]},
{recFull, data[200:400]},
},
fail: true,
},
// Non-zero data after page termination.
{
t: []record{
{recFull, data[:100]},
{recPageTerm, append(make([]byte, pageSize-recordHeaderSize-102), 1)},
},
exp: [][]byte{data[:100]},
fail: true,
},
}
func encodedRecord(t recType, b []byte) []byte {
if t == recPageTerm {
return append([]byte{0}, b...)
}
r := make([]byte, recordHeaderSize)
r[0] = byte(t)
binary.BigEndian.PutUint16(r[1:], uint16(len(b)))
binary.BigEndian.PutUint32(r[3:], crc32.Checksum(b, castagnoliTable))
return append(r, b...)
}
// TestReader feeds the reader a stream of encoded records with different types.
func TestReader(t *testing.T) {
for name, fn := range readerConstructors {
for i, c := range testReaderCases {
t.Run(fmt.Sprintf("%s/%d", name, i), func(t *testing.T) {
var buf []byte
for _, r := range c.t {
buf = append(buf, encodedRecord(r.t, r.b)...)
}
r := fn(bytes.NewReader(buf))
for j := 0; r.Next(); j++ {
t.Logf("record %d", j)
rec := r.Record()
if j >= len(c.exp) {
t.Fatal("received more records than expected")
}
testutil.Equals(t, c.exp[j], rec, "Bytes within record did not match expected Bytes")
}
if !c.fail && r.Err() != nil {
t.Fatalf("unexpected error: %s", r.Err())
}
if c.fail && r.Err() == nil {
t.Fatalf("expected error but got none")
}
})
}
}
}
func TestReader_Live(t *testing.T) {
logger := testutil.NewLogger(t)
for i := range testReaderCases {
t.Run(strconv.Itoa(i), func(t *testing.T) {
writeFd, err := ioutil.TempFile("", "TestReader_Live")
testutil.Ok(t, err)
defer os.Remove(writeFd.Name())
go func(i int) {
for _, rec := range testReaderCases[i].t {
rec := encodedRecord(rec.t, rec.b)
_, err := writeFd.Write(rec)
testutil.Ok(t, err)
runtime.Gosched()
}
writeFd.Close()
}(i)
// Read from a second FD on the same file.
readFd, err := os.Open(writeFd.Name())
testutil.Ok(t, err)
reader := NewLiveReader(logger, NewLiveReaderMetrics(nil), readFd)
for _, exp := range testReaderCases[i].exp {
for !reader.Next() {
testutil.Assert(t, reader.Err() == io.EOF, "expect EOF, got: %v", reader.Err())
runtime.Gosched()
}
actual := reader.Record()
testutil.Equals(t, exp, actual, "read wrong record")
}
testutil.Assert(t, !reader.Next(), "unexpected record")
if testReaderCases[i].fail {
testutil.Assert(t, reader.Err() != nil, "expected error")
}
})
}
}
const fuzzLen = 500
func generateRandomEntries(w *WAL, records chan []byte) error {
var recs [][]byte
for i := 0; i < fuzzLen; i++ {
var sz int64
switch i % 5 {
case 0, 1:
sz = 50
case 2, 3:
sz = pageSize
default:
sz = pageSize * 8
}
rec := make([]byte, rand.Int63n(sz))
if _, err := rand.Read(rec); err != nil {
return err
}
records <- rec
// Randomly batch up records.
recs = append(recs, rec)
if rand.Intn(4) < 3 {
if err := w.Log(recs...); err != nil {
return err
}
recs = recs[:0]
}
}
return w.Log(recs...)
}
type multiReadCloser struct {
reader io.Reader
closers []io.Closer
}
func (m *multiReadCloser) Read(p []byte) (n int, err error) {
return m.reader.Read(p)
}
func (m *multiReadCloser) Close() error {
var merr tsdb_errors.MultiError
for _, closer := range m.closers {
merr.Add(closer.Close())
}
return merr.Err()
}
func allSegments(dir string) (io.ReadCloser, error) {
seg, err := listSegments(dir)
if err != nil {
return nil, err
}
var readers []io.Reader
var closers []io.Closer
for _, r := range seg {
f, err := os.Open(filepath.Join(dir, r.name))
if err != nil {
return nil, err
}
readers = append(readers, f)
closers = append(closers, f)
}
return &multiReadCloser{
reader: io.MultiReader(readers...),
closers: closers,
}, nil
}
func TestReaderFuzz(t *testing.T) {
for name, fn := range readerConstructors {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("%s,compress=%t", name, compress), func(t *testing.T) {
dir, err := ioutil.TempDir("", "wal_fuzz_live")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, 128*pageSize, compress)
testutil.Ok(t, err)
// Buffering required as we're not reading concurrently.
input := make(chan []byte, fuzzLen)
err = generateRandomEntries(w, input)
testutil.Ok(t, err)
close(input)
err = w.Close()
testutil.Ok(t, err)
sr, err := allSegments(w.Dir())
testutil.Ok(t, err)
defer sr.Close()
reader := fn(sr)
for expected := range input {
testutil.Assert(t, reader.Next(), "expected record: %v", reader.Err())
testutil.Equals(t, expected, reader.Record(), "read wrong record")
}
testutil.Assert(t, !reader.Next(), "unexpected record")
})
}
}
}
func TestReaderFuzz_Live(t *testing.T) {
logger := testutil.NewLogger(t)
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
dir, err := ioutil.TempDir("", "wal_fuzz_live")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, 128*pageSize, compress)
testutil.Ok(t, err)
defer w.Close()
// In the background, generate a stream of random records and write them
// to the WAL.
input := make(chan []byte, fuzzLen/10) // buffering required as we sometimes batch WAL writes.
done := make(chan struct{})
go func() {
err := generateRandomEntries(w, input)
testutil.Ok(t, err)
time.Sleep(100 * time.Millisecond)
close(done)
}()
// Tail the WAL and compare the results.
m, _, err := w.Segments()
testutil.Ok(t, err)
seg, err := OpenReadSegment(SegmentName(dir, m))
testutil.Ok(t, err)
defer seg.Close()
r := NewLiveReader(logger, nil, seg)
segmentTicker := time.NewTicker(100 * time.Millisecond)
readTicker := time.NewTicker(10 * time.Millisecond)
readSegment := func(r *LiveReader) bool {
for r.Next() {
rec := r.Record()
expected, ok := <-input
testutil.Assert(t, ok, "unexpected record")
testutil.Equals(t, expected, rec, "record does not match expected")
}
testutil.Assert(t, r.Err() == io.EOF, "expected EOF, got: %v", r.Err())
return true
}
outer:
for {
select {
case <-segmentTicker.C:
// check if new segments exist
_, last, err := w.Segments()
testutil.Ok(t, err)
if last <= seg.i {
continue
}
// read to end of segment.
readSegment(r)
fi, err := os.Stat(SegmentName(dir, seg.i))
testutil.Ok(t, err)
testutil.Assert(t, r.Offset() == fi.Size(), "expected to have read whole segment, but read %d of %d", r.Offset(), fi.Size())
seg, err = OpenReadSegment(SegmentName(dir, seg.i+1))
testutil.Ok(t, err)
defer seg.Close()
r = NewLiveReader(logger, nil, seg)
case <-readTicker.C:
readSegment(r)
case <-done:
readSegment(r)
break outer
}
}
testutil.Assert(t, r.Err() == io.EOF, "expected EOF")
})
}
}
func TestLiveReaderCorrupt_ShortFile(t *testing.T) {
// Write a corrupt WAL segment, there is one record of pageSize in length,
// but the segment is only half written.
logger := testutil.NewLogger(t)
dir, err := ioutil.TempDir("", "wal_live_corrupt")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, pageSize, false)
testutil.Ok(t, err)
rec := make([]byte, pageSize-recordHeaderSize)
_, err = rand.Read(rec)
testutil.Ok(t, err)
err = w.Log(rec)
testutil.Ok(t, err)
err = w.Close()
testutil.Ok(t, err)
segmentFile, err := os.OpenFile(filepath.Join(dir, "00000000"), os.O_RDWR, 0666)
testutil.Ok(t, err)
err = segmentFile.Truncate(pageSize / 2)
testutil.Ok(t, err)
err = segmentFile.Close()
testutil.Ok(t, err)
// Try and LiveReader it.
m, _, err := w.Segments()
testutil.Ok(t, err)
seg, err := OpenReadSegment(SegmentName(dir, m))
testutil.Ok(t, err)
defer seg.Close()
r := NewLiveReader(logger, nil, seg)
testutil.Assert(t, r.Next() == false, "expected no records")
testutil.Assert(t, r.Err() == io.EOF, "expected error, got: %v", r.Err())
}
func TestLiveReaderCorrupt_RecordTooLongAndShort(t *testing.T) {
// Write a corrupt WAL segment, when record len > page size.
logger := testutil.NewLogger(t)
dir, err := ioutil.TempDir("", "wal_live_corrupt")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, pageSize*2, false)
testutil.Ok(t, err)
rec := make([]byte, pageSize-recordHeaderSize)
_, err = rand.Read(rec)
testutil.Ok(t, err)
err = w.Log(rec)
testutil.Ok(t, err)
err = w.Close()
testutil.Ok(t, err)
segmentFile, err := os.OpenFile(filepath.Join(dir, "00000000"), os.O_RDWR, 0666)
testutil.Ok(t, err)
// Override the record length
buf := make([]byte, 3)
buf[0] = byte(recFull)
binary.BigEndian.PutUint16(buf[1:], 0xFFFF)
_, err = segmentFile.WriteAt(buf, 0)
testutil.Ok(t, err)
err = segmentFile.Close()
testutil.Ok(t, err)
// Try and LiveReader it.
m, _, err := w.Segments()
testutil.Ok(t, err)
seg, err := OpenReadSegment(SegmentName(dir, m))
testutil.Ok(t, err)
defer seg.Close()
r := NewLiveReader(logger, NewLiveReaderMetrics(nil), seg)
testutil.Assert(t, r.Next() == false, "expected no records")
testutil.Assert(t, r.Err().Error() == "record length greater than a single page: 65542 > 32768", "expected error, got: %v", r.Err())
}
func TestReaderData(t *testing.T) {
dir := os.Getenv("WALDIR")
if dir == "" {
return
}
for name, fn := range readerConstructors {
t.Run(name, func(t *testing.T) {
w, err := New(nil, nil, dir, true)
testutil.Ok(t, err)
sr, err := allSegments(dir)
testutil.Ok(t, err)
reader := fn(sr)
for reader.Next() {
}
testutil.Ok(t, reader.Err())
err = w.Repair(reader.Err())
testutil.Ok(t, err)
})
}
}

856
tsdb/wal/wal.go Normal file
View file

@ -0,0 +1,856 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bufio"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
"sort"
"strconv"
"sync"
"time"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/golang/snappy"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/tsdb/fileutil"
)
const (
DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
pageSize = 32 * 1024 // 32KB
recordHeaderSize = 7
)
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
// page is an in memory buffer used to batch disk writes.
// Records bigger than the page size are split and flushed separately.
// A flush is triggered when a single records doesn't fit the page size or
// when the next record can't fit in the remaining free page space.
type page struct {
alloc int
flushed int
buf [pageSize]byte
}
func (p *page) remaining() int {
return pageSize - p.alloc
}
func (p *page) full() bool {
return pageSize-p.alloc < recordHeaderSize
}
// Segment represents a segment file.
type Segment struct {
*os.File
dir string
i int
}
// Index returns the index of the segment.
func (s *Segment) Index() int {
return s.i
}
// Dir returns the directory of the segment.
func (s *Segment) Dir() string {
return s.dir
}
// CorruptionErr is an error that's returned when corruption is encountered.
type CorruptionErr struct {
Dir string
Segment int
Offset int64
Err error
}
func (e *CorruptionErr) Error() string {
if e.Segment < 0 {
return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err)
}
return fmt.Sprintf("corruption in segment %s at %d: %s", SegmentName(e.Dir, e.Segment), e.Offset, e.Err)
}
// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends.
func OpenWriteSegment(logger log.Logger, dir string, k int) (*Segment, error) {
segName := SegmentName(dir, k)
f, err := os.OpenFile(segName, os.O_WRONLY|os.O_APPEND, 0666)
if err != nil {
return nil, err
}
stat, err := f.Stat()
if err != nil {
f.Close()
return nil, err
}
// If the last page is torn, fill it with zeros.
// In case it was torn after all records were written successfully, this
// will just pad the page and everything will be fine.
// If it was torn mid-record, a full read (which the caller should do anyway
// to ensure integrity) will detect it as a corruption by the end.
if d := stat.Size() % pageSize; d != 0 {
level.Warn(logger).Log("msg", "last page of the wal is torn, filling it with zeros", "segment", segName)
if _, err := f.Write(make([]byte, pageSize-d)); err != nil {
f.Close()
return nil, errors.Wrap(err, "zero-pad torn page")
}
}
return &Segment{File: f, i: k, dir: dir}, nil
}
// CreateSegment creates a new segment k in dir.
func CreateSegment(dir string, k int) (*Segment, error) {
f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
return nil, err
}
return &Segment{File: f, i: k, dir: dir}, nil
}
// OpenReadSegment opens the segment with the given filename.
func OpenReadSegment(fn string) (*Segment, error) {
k, err := strconv.Atoi(filepath.Base(fn))
if err != nil {
return nil, errors.New("not a valid filename")
}
f, err := os.Open(fn)
if err != nil {
return nil, err
}
return &Segment{File: f, i: k, dir: filepath.Dir(fn)}, nil
}
// WAL is a write ahead log that stores records in segment files.
// It must be read from start to end once before logging new data.
// If an error occurs during read, the repair procedure must be called
// before it's safe to do further writes.
//
// Segments are written to in pages of 32KB, with records possibly split
// across page boundaries.
// Records are never split across segments to allow full segments to be
// safely truncated. It also ensures that torn writes never corrupt records
// beyond the most recent segment.
type WAL struct {
dir string
logger log.Logger
segmentSize int
mtx sync.RWMutex
segment *Segment // Active segment.
donePages int // Pages written to the segment.
page *page // Active page.
stopc chan chan struct{}
actorc chan func()
closed bool // To allow calling Close() more than once without blocking.
compress bool
snappyBuf []byte
fsyncDuration prometheus.Summary
pageFlushes prometheus.Counter
pageCompletions prometheus.Counter
truncateFail prometheus.Counter
truncateTotal prometheus.Counter
currentSegment prometheus.Gauge
}
// New returns a new WAL over the given directory.
func New(logger log.Logger, reg prometheus.Registerer, dir string, compress bool) (*WAL, error) {
return NewSize(logger, reg, dir, DefaultSegmentSize, compress)
}
// NewSize returns a new WAL over the given directory.
// New segments are created with the specified size.
func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int, compress bool) (*WAL, error) {
if segmentSize%pageSize != 0 {
return nil, errors.New("invalid segment size")
}
if err := os.MkdirAll(dir, 0777); err != nil {
return nil, errors.Wrap(err, "create dir")
}
if logger == nil {
logger = log.NewNopLogger()
}
w := &WAL{
dir: dir,
logger: logger,
segmentSize: segmentSize,
page: &page{},
actorc: make(chan func(), 100),
stopc: make(chan chan struct{}),
compress: compress,
}
registerMetrics(reg, w)
_, j, err := w.Segments()
// Index of the Segment we want to open and write to.
writeSegmentIndex := 0
if err != nil {
return nil, errors.Wrap(err, "get segment range")
}
// If some segments already exist create one with a higher index than the last segment.
if j != -1 {
writeSegmentIndex = j + 1
}
segment, err := CreateSegment(w.dir, writeSegmentIndex)
if err != nil {
return nil, err
}
if err := w.setSegment(segment); err != nil {
return nil, err
}
go w.run()
return w, nil
}
// Open an existing WAL.
func Open(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) {
if logger == nil {
logger = log.NewNopLogger()
}
w := &WAL{
dir: dir,
logger: logger,
}
registerMetrics(reg, w)
return w, nil
}
func registerMetrics(reg prometheus.Registerer, w *WAL) {
w.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "prometheus_tsdb_wal_fsync_duration_seconds",
Help: "Duration of WAL fsync.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
})
w.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_page_flushes_total",
Help: "Total number of page flushes.",
})
w.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_completed_pages_total",
Help: "Total number of completed pages.",
})
w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_failed_total",
Help: "Total number of WAL truncations that failed.",
})
w.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_total",
Help: "Total number of WAL truncations attempted.",
})
w.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_wal_segment_current",
Help: "WAL segment index that TSDB is currently writing to.",
})
if reg != nil {
reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail, w.truncateTotal, w.currentSegment)
}
}
// CompressionEnabled returns if compression is enabled on this WAL.
func (w *WAL) CompressionEnabled() bool {
return w.compress
}
// Dir returns the directory of the WAL.
func (w *WAL) Dir() string {
return w.dir
}
func (w *WAL) run() {
Loop:
for {
select {
case f := <-w.actorc:
f()
case donec := <-w.stopc:
close(w.actorc)
defer close(donec)
break Loop
}
}
// Drain and process any remaining functions.
for f := range w.actorc {
f()
}
}
// Repair attempts to repair the WAL based on the error.
// It discards all data after the corruption.
func (w *WAL) Repair(origErr error) error {
// We could probably have a mode that only discards torn records right around
// the corruption to preserve as data much as possible.
// But that's not generally applicable if the records have any kind of causality.
// Maybe as an extra mode in the future if mid-WAL corruptions become
// a frequent concern.
err := errors.Cause(origErr) // So that we can pick up errors even if wrapped.
cerr, ok := err.(*CorruptionErr)
if !ok {
return errors.Wrap(origErr, "cannot handle error")
}
if cerr.Segment < 0 {
return errors.New("corruption error does not specify position")
}
level.Warn(w.logger).Log("msg", "starting corruption repair",
"segment", cerr.Segment, "offset", cerr.Offset)
// All segments behind the corruption can no longer be used.
segs, err := listSegments(w.dir)
if err != nil {
return errors.Wrap(err, "list segments")
}
level.Warn(w.logger).Log("msg", "deleting all segments newer than corrupted segment", "segment", cerr.Segment)
for _, s := range segs {
if w.segment.i == s.index {
// The active segment needs to be removed,
// close it first (Windows!). Can be closed safely
// as we set the current segment to repaired file
// below.
if err := w.segment.Close(); err != nil {
return errors.Wrap(err, "close active segment")
}
}
if s.index <= cerr.Segment {
continue
}
if err := os.Remove(filepath.Join(w.dir, s.name)); err != nil {
return errors.Wrapf(err, "delete segment:%v", s.index)
}
}
// Regardless of the corruption offset, no record reaches into the previous segment.
// So we can safely repair the WAL by removing the segment and re-inserting all
// its records up to the corruption.
level.Warn(w.logger).Log("msg", "rewrite corrupted segment", "segment", cerr.Segment)
fn := SegmentName(w.dir, cerr.Segment)
tmpfn := fn + ".repair"
if err := fileutil.Rename(fn, tmpfn); err != nil {
return err
}
// Create a clean segment and make it the active one.
s, err := CreateSegment(w.dir, cerr.Segment)
if err != nil {
return err
}
if err := w.setSegment(s); err != nil {
return err
}
f, err := os.Open(tmpfn)
if err != nil {
return errors.Wrap(err, "open segment")
}
defer f.Close()
r := NewReader(bufio.NewReader(f))
for r.Next() {
// Add records only up to the where the error was.
if r.Offset() >= cerr.Offset {
break
}
if err := w.Log(r.Record()); err != nil {
return errors.Wrap(err, "insert record")
}
}
// We expect an error here from r.Err(), so nothing to handle.
// We need to pad to the end of the last page in the repaired segment
w.flushPage(true)
// We explicitly close even when there is a defer for Windows to be
// able to delete it. The defer is in place to close it in-case there
// are errors above.
if err := f.Close(); err != nil {
return errors.Wrap(err, "close corrupted file")
}
if err := os.Remove(tmpfn); err != nil {
return errors.Wrap(err, "delete corrupted segment")
}
// Explicitly close the the segment we just repaired to avoid issues with Windows.
s.Close()
// We always want to start writing to a new Segment rather than an existing
// Segment, which is handled by NewSize, but earlier in Repair we're deleting
// all segments that come after the corrupted Segment. Recreate a new Segment here.
s, err = CreateSegment(w.dir, cerr.Segment+1)
if err != nil {
return err
}
if err := w.setSegment(s); err != nil {
return err
}
return nil
}
// SegmentName builds a segment name for the directory.
func SegmentName(dir string, i int) string {
return filepath.Join(dir, fmt.Sprintf("%08d", i))
}
// NextSegment creates the next segment and closes the previous one.
func (w *WAL) NextSegment() error {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.nextSegment()
}
// nextSegment creates the next segment and closes the previous one.
func (w *WAL) nextSegment() error {
// Only flush the current page if it actually holds data.
if w.page.alloc > 0 {
if err := w.flushPage(true); err != nil {
return err
}
}
next, err := CreateSegment(w.dir, w.segment.Index()+1)
if err != nil {
return errors.Wrap(err, "create new segment file")
}
prev := w.segment
if err := w.setSegment(next); err != nil {
return err
}
// Don't block further writes by fsyncing the last segment.
w.actorc <- func() {
if err := w.fsync(prev); err != nil {
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
}
if err := prev.Close(); err != nil {
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
}
}
return nil
}
func (w *WAL) setSegment(segment *Segment) error {
w.segment = segment
// Correctly initialize donePages.
stat, err := segment.Stat()
if err != nil {
return err
}
w.donePages = int(stat.Size() / pageSize)
w.currentSegment.Set(float64(segment.Index()))
return nil
}
// flushPage writes the new contents of the page to disk. If no more records will fit into
// the page, the remaining bytes will be set to zero and a new page will be started.
// If clear is true, this is enforced regardless of how many bytes are left in the page.
func (w *WAL) flushPage(clear bool) error {
w.pageFlushes.Inc()
p := w.page
clear = clear || p.full()
// No more data will fit into the page or an implicit clear.
// Enqueue and clear it.
if clear {
p.alloc = pageSize // Write till end of page.
}
n, err := w.segment.Write(p.buf[p.flushed:p.alloc])
if err != nil {
return err
}
p.flushed += n
// We flushed an entire page, prepare a new one.
if clear {
for i := range p.buf {
p.buf[i] = 0
}
p.alloc = 0
p.flushed = 0
w.donePages++
w.pageCompletions.Inc()
}
return nil
}
// First Byte of header format:
// [ 4 bits unallocated] [1 bit snappy compression flag] [ 3 bit record type ]
const (
snappyMask = 1 << 3
recTypeMask = snappyMask - 1
)
type recType uint8
const (
recPageTerm recType = 0 // Rest of page is empty.
recFull recType = 1 // Full record.
recFirst recType = 2 // First fragment of a record.
recMiddle recType = 3 // Middle fragments of a record.
recLast recType = 4 // Final fragment of a record.
)
func recTypeFromHeader(header byte) recType {
return recType(header & recTypeMask)
}
func (t recType) String() string {
switch t {
case recPageTerm:
return "zero"
case recFull:
return "full"
case recFirst:
return "first"
case recMiddle:
return "middle"
case recLast:
return "last"
default:
return "<invalid>"
}
}
func (w *WAL) pagesPerSegment() int {
return w.segmentSize / pageSize
}
// Log writes the records into the log.
// Multiple records can be passed at once to reduce writes and increase throughput.
func (w *WAL) Log(recs ...[]byte) error {
w.mtx.Lock()
defer w.mtx.Unlock()
// Callers could just implement their own list record format but adding
// a bit of extra logic here frees them from that overhead.
for i, r := range recs {
if err := w.log(r, i == len(recs)-1); err != nil {
return err
}
}
return nil
}
// log writes rec to the log and forces a flush of the current page if:
// - the final record of a batch
// - the record is bigger than the page size
// - the current page is full.
func (w *WAL) log(rec []byte, final bool) error {
// When the last page flush failed the page will remain full.
// When the page is full, need to flush it before trying to add more records to it.
if w.page.full() {
if err := w.flushPage(true); err != nil {
return err
}
}
// If the record is too big to fit within the active page in the current
// segment, terminate the active segment and advance to the next one.
// This ensures that records do not cross segment boundaries.
left := w.page.remaining() - recordHeaderSize // Free space in the active page.
left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
if len(rec) > left {
if err := w.nextSegment(); err != nil {
return err
}
}
compressed := false
if w.compress && len(rec) > 0 {
// The snappy library uses `len` to calculate if we need a new buffer.
// In order to allocate as few buffers as possible make the length
// equal to the capacity.
w.snappyBuf = w.snappyBuf[:cap(w.snappyBuf)]
w.snappyBuf = snappy.Encode(w.snappyBuf, rec)
if len(w.snappyBuf) < len(rec) {
rec = w.snappyBuf
compressed = true
}
}
// Populate as many pages as necessary to fit the record.
// Be careful to always do one pass to ensure we write zero-length records.
for i := 0; i == 0 || len(rec) > 0; i++ {
p := w.page
// Find how much of the record we can fit into the page.
var (
l = min(len(rec), (pageSize-p.alloc)-recordHeaderSize)
part = rec[:l]
buf = p.buf[p.alloc:]
typ recType
)
switch {
case i == 0 && len(part) == len(rec):
typ = recFull
case len(part) == len(rec):
typ = recLast
case i == 0:
typ = recFirst
default:
typ = recMiddle
}
if compressed {
typ |= snappyMask
}
buf[0] = byte(typ)
crc := crc32.Checksum(part, castagnoliTable)
binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
binary.BigEndian.PutUint32(buf[3:], crc)
copy(buf[recordHeaderSize:], part)
p.alloc += len(part) + recordHeaderSize
if w.page.full() {
if err := w.flushPage(true); err != nil {
return err
}
}
rec = rec[l:]
}
// If it's the final record of the batch and the page is not empty, flush it.
if final && w.page.alloc > 0 {
if err := w.flushPage(false); err != nil {
return err
}
}
return nil
}
// Segments returns the range [first, n] of currently existing segments.
// If no segments are found, first and n are -1.
func (w *WAL) Segments() (first, last int, err error) {
refs, err := listSegments(w.dir)
if err != nil {
return 0, 0, err
}
if len(refs) == 0 {
return -1, -1, nil
}
return refs[0].index, refs[len(refs)-1].index, nil
}
// Truncate drops all segments before i.
func (w *WAL) Truncate(i int) (err error) {
w.truncateTotal.Inc()
defer func() {
if err != nil {
w.truncateFail.Inc()
}
}()
refs, err := listSegments(w.dir)
if err != nil {
return err
}
for _, r := range refs {
if r.index >= i {
break
}
if err = os.Remove(filepath.Join(w.dir, r.name)); err != nil {
return err
}
}
return nil
}
func (w *WAL) fsync(f *Segment) error {
start := time.Now()
err := f.File.Sync()
w.fsyncDuration.Observe(time.Since(start).Seconds())
return err
}
// Close flushes all writes and closes active segment.
func (w *WAL) Close() (err error) {
w.mtx.Lock()
defer w.mtx.Unlock()
if w.closed {
return errors.New("wal already closed")
}
// Flush the last page and zero out all its remaining size.
// We must not flush an empty page as it would falsely signal
// the segment is done if we start writing to it again after opening.
if w.page.alloc > 0 {
if err := w.flushPage(true); err != nil {
return err
}
}
donec := make(chan struct{})
w.stopc <- donec
<-donec
if err = w.fsync(w.segment); err != nil {
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
}
if err := w.segment.Close(); err != nil {
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
}
w.closed = true
return nil
}
type segmentRef struct {
name string
index int
}
func listSegments(dir string) (refs []segmentRef, err error) {
files, err := fileutil.ReadDir(dir)
if err != nil {
return nil, err
}
var last int
for _, fn := range files {
k, err := strconv.Atoi(fn)
if err != nil {
continue
}
if len(refs) > 0 && k > last+1 {
return nil, errors.New("segments are not sequential")
}
refs = append(refs, segmentRef{name: fn, index: k})
last = k
}
sort.Slice(refs, func(i, j int) bool {
return refs[i].index < refs[j].index
})
return refs, nil
}
// SegmentRange groups segments by the directory and the first and last index it includes.
type SegmentRange struct {
Dir string
First, Last int
}
// NewSegmentsReader returns a new reader over all segments in the directory.
func NewSegmentsReader(dir string) (io.ReadCloser, error) {
return NewSegmentsRangeReader(SegmentRange{dir, -1, -1})
}
// NewSegmentsRangeReader returns a new reader over the given WAL segment ranges.
// If first or last are -1, the range is open on the respective end.
func NewSegmentsRangeReader(sr ...SegmentRange) (io.ReadCloser, error) {
var segs []*Segment
for _, sgmRange := range sr {
refs, err := listSegments(sgmRange.Dir)
if err != nil {
return nil, errors.Wrapf(err, "list segment in dir:%v", sgmRange.Dir)
}
for _, r := range refs {
if sgmRange.First >= 0 && r.index < sgmRange.First {
continue
}
if sgmRange.Last >= 0 && r.index > sgmRange.Last {
break
}
s, err := OpenReadSegment(filepath.Join(sgmRange.Dir, r.name))
if err != nil {
return nil, errors.Wrapf(err, "open segment:%v in dir:%v", r.name, sgmRange.Dir)
}
segs = append(segs, s)
}
}
return NewSegmentBufReader(segs...), nil
}
// segmentBufReader is a buffered reader that reads in multiples of pages.
// The main purpose is that we are able to track segment and offset for
// corruption reporting. We have to be careful not to increment curr too
// early, as it is used by Reader.Err() to tell Repair which segment is corrupt.
// As such we pad the end of non-page align segments with zeros.
type segmentBufReader struct {
buf *bufio.Reader
segs []*Segment
cur int // Index into segs.
off int // Offset of read data into current segment.
}
func NewSegmentBufReader(segs ...*Segment) *segmentBufReader {
return &segmentBufReader{
buf: bufio.NewReaderSize(segs[0], 16*pageSize),
segs: segs,
}
}
func (r *segmentBufReader) Close() (err error) {
for _, s := range r.segs {
if e := s.Close(); e != nil {
err = e
}
}
return err
}
// Read implements io.Reader.
func (r *segmentBufReader) Read(b []byte) (n int, err error) {
n, err = r.buf.Read(b)
r.off += n
// If we succeeded, or hit a non-EOF, we can stop.
if err == nil || err != io.EOF {
return n, err
}
// We hit EOF; fake out zero padding at the end of short segments, so we
// don't increment curr too early and report the wrong segment as corrupt.
if r.off%pageSize != 0 {
i := 0
for ; n+i < len(b) && (r.off+i)%pageSize != 0; i++ {
b[n+i] = 0
}
// Return early, even if we didn't fill b.
r.off += i
return n + i, nil
}
// There is no more deta left in the curr segment and there are no more
// segments left. Return EOF.
if r.cur+1 >= len(r.segs) {
return n, io.EOF
}
// Move to next segment.
r.cur++
r.off = 0
r.buf.Reset(r.segs[r.cur])
return n, nil
}

477
tsdb/wal/wal_test.go Normal file
View file

@ -0,0 +1,477 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"fmt"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"testing"
client_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/tsdb/testutil"
)
// TestWALRepair_ReadingError ensures that a repair is run for an error
// when reading a record.
func TestWALRepair_ReadingError(t *testing.T) {
for name, test := range map[string]struct {
corrSgm int // Which segment to corrupt.
corrFunc func(f *os.File) // Func that applies the corruption.
intactRecs int // Total expected records left after the repair.
}{
"torn_last_record": {
2,
func(f *os.File) {
_, err := f.Seek(pageSize*2, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{byte(recFirst)})
testutil.Ok(t, err)
},
8,
},
// Ensures that the page buffer is big enough to fit
// an entire page size without panicing.
// https://github.com/prometheus/tsdb/pull/414
"bad_header": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{byte(recPageTerm)})
testutil.Ok(t, err)
},
4,
},
"bad_fragment_sequence": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{byte(recLast)})
testutil.Ok(t, err)
},
4,
},
"bad_fragment_flag": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{123})
testutil.Ok(t, err)
},
4,
},
"bad_checksum": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize+4, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{0})
testutil.Ok(t, err)
},
4,
},
"bad_length": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize+2, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte{0})
testutil.Ok(t, err)
},
4,
},
"bad_content": {
1,
func(f *os.File) {
_, err := f.Seek(pageSize+100, 0)
testutil.Ok(t, err)
_, err = f.Write([]byte("beef"))
testutil.Ok(t, err)
},
4,
},
} {
t.Run(name, func(t *testing.T) {
dir, err := ioutil.TempDir("", "wal_repair")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
// We create 3 segments with 3 records each and
// then corrupt a given record in a given segment.
// As a result we want a repaired WAL with given intact records.
segSize := 3 * pageSize
w, err := NewSize(nil, nil, dir, segSize, false)
testutil.Ok(t, err)
var records [][]byte
for i := 1; i <= 9; i++ {
b := make([]byte, pageSize-recordHeaderSize)
b[0] = byte(i)
records = append(records, b)
testutil.Ok(t, w.Log(b))
}
first, last, err := w.Segments()
testutil.Ok(t, err)
testutil.Equals(t, 3, 1+last-first, "wal creation didn't result in expected number of segments")
testutil.Ok(t, w.Close())
f, err := os.OpenFile(SegmentName(dir, test.corrSgm), os.O_RDWR, 0666)
testutil.Ok(t, err)
// Apply corruption function.
test.corrFunc(f)
testutil.Ok(t, f.Close())
w, err = NewSize(nil, nil, dir, segSize, false)
testutil.Ok(t, err)
defer w.Close()
first, last, err = w.Segments()
testutil.Ok(t, err)
// Backfill segments from the most recent checkpoint onwards.
for i := first; i <= last; i++ {
s, err := OpenReadSegment(SegmentName(w.Dir(), i))
testutil.Ok(t, err)
sr := NewSegmentBufReader(s)
testutil.Ok(t, err)
r := NewReader(sr)
for r.Next() {
}
//Close the segment so we don't break things on Windows.
s.Close()
// No corruption in this segment.
if r.Err() == nil {
continue
}
testutil.Ok(t, w.Repair(r.Err()))
break
}
sr, err := NewSegmentsReader(dir)
testutil.Ok(t, err)
defer sr.Close()
r := NewReader(sr)
var result [][]byte
for r.Next() {
var b []byte
result = append(result, append(b, r.Record()...))
}
testutil.Ok(t, r.Err())
testutil.Equals(t, test.intactRecs, len(result), "Wrong number of intact records")
for i, r := range result {
if !bytes.Equal(records[i], r) {
t.Fatalf("record %d diverges: want %x, got %x", i, records[i][:10], r[:10])
}
}
// Make sure there is a new 0 size Segment after the corrupted Segment.
_, last, err = w.Segments()
testutil.Ok(t, err)
testutil.Equals(t, test.corrSgm+1, last)
fi, err := os.Stat(SegmentName(dir, last))
testutil.Ok(t, err)
testutil.Equals(t, int64(0), fi.Size())
})
}
}
// TestCorruptAndCarryOn writes a multi-segment WAL; corrupts the first segment and
// ensures that an error during reading that segment are correctly repaired before
// moving to write more records to the WAL.
func TestCorruptAndCarryOn(t *testing.T) {
dir, err := ioutil.TempDir("", "wal_repair")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
var (
logger = testutil.NewLogger(t)
segmentSize = pageSize * 3
recordSize = (pageSize / 3) - recordHeaderSize
)
// Produce a WAL with a two segments of 3 pages with 3 records each,
// so when we truncate the file we're guaranteed to split a record.
{
w, err := NewSize(logger, nil, dir, segmentSize, false)
testutil.Ok(t, err)
for i := 0; i < 18; i++ {
buf := make([]byte, recordSize)
_, err := rand.Read(buf)
testutil.Ok(t, err)
err = w.Log(buf)
testutil.Ok(t, err)
}
err = w.Close()
testutil.Ok(t, err)
}
// Check all the segments are the correct size.
{
segments, err := listSegments(dir)
testutil.Ok(t, err)
for _, segment := range segments {
f, err := os.OpenFile(filepath.Join(dir, fmt.Sprintf("%08d", segment.index)), os.O_RDONLY, 0666)
testutil.Ok(t, err)
fi, err := f.Stat()
testutil.Ok(t, err)
t.Log("segment", segment.index, "size", fi.Size())
testutil.Equals(t, int64(segmentSize), fi.Size())
err = f.Close()
testutil.Ok(t, err)
}
}
// Truncate the first file, splitting the middle record in the second
// page in half, leaving 4 valid records.
{
f, err := os.OpenFile(filepath.Join(dir, fmt.Sprintf("%08d", 0)), os.O_RDWR, 0666)
testutil.Ok(t, err)
fi, err := f.Stat()
testutil.Ok(t, err)
testutil.Equals(t, int64(segmentSize), fi.Size())
err = f.Truncate(int64(segmentSize / 2))
testutil.Ok(t, err)
err = f.Close()
testutil.Ok(t, err)
}
// Now try and repair this WAL, and write 5 more records to it.
{
sr, err := NewSegmentsReader(dir)
testutil.Ok(t, err)
reader := NewReader(sr)
i := 0
for ; i < 4 && reader.Next(); i++ {
testutil.Equals(t, recordSize, len(reader.Record()))
}
testutil.Equals(t, 4, i, "not enough records")
testutil.Assert(t, !reader.Next(), "unexpected record")
corruptionErr := reader.Err()
testutil.Assert(t, corruptionErr != nil, "expected error")
err = sr.Close()
testutil.Ok(t, err)
w, err := NewSize(logger, nil, dir, segmentSize, false)
testutil.Ok(t, err)
err = w.Repair(corruptionErr)
testutil.Ok(t, err)
// Ensure that we have a completely clean slate after reapiring.
testutil.Equals(t, w.segment.Index(), 1) // We corrupted segment 0.
testutil.Equals(t, w.donePages, 0)
for i := 0; i < 5; i++ {
buf := make([]byte, recordSize)
_, err := rand.Read(buf)
testutil.Ok(t, err)
err = w.Log(buf)
testutil.Ok(t, err)
}
err = w.Close()
testutil.Ok(t, err)
}
// Replay the WAL. Should get 9 records.
{
sr, err := NewSegmentsReader(dir)
testutil.Ok(t, err)
reader := NewReader(sr)
i := 0
for ; i < 9 && reader.Next(); i++ {
testutil.Equals(t, recordSize, len(reader.Record()))
}
testutil.Equals(t, 9, i, "wrong number of records")
testutil.Assert(t, !reader.Next(), "unexpected record")
testutil.Equals(t, nil, reader.Err())
sr.Close()
}
}
// TestClose ensures that calling Close more than once doesn't panic and doesn't block.
func TestClose(t *testing.T) {
dir, err := ioutil.TempDir("", "wal_repair")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, pageSize, false)
testutil.Ok(t, err)
testutil.Ok(t, w.Close())
testutil.NotOk(t, w.Close())
}
func TestSegmentMetric(t *testing.T) {
var (
segmentSize = pageSize
recordSize = (pageSize / 2) - recordHeaderSize
)
dir, err := ioutil.TempDir("", "segment_metric")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := NewSize(nil, nil, dir, segmentSize, false)
testutil.Ok(t, err)
initialSegment := client_testutil.ToFloat64(w.currentSegment)
// Write 3 records, each of which is half the segment size, meaning we should rotate to the next segment.
for i := 0; i < 3; i++ {
buf := make([]byte, recordSize)
_, err := rand.Read(buf)
testutil.Ok(t, err)
err = w.Log(buf)
testutil.Ok(t, err)
}
testutil.Assert(t, client_testutil.ToFloat64(w.currentSegment) == initialSegment+1, "segment metric did not increment after segment rotation")
testutil.Ok(t, w.Close())
}
func TestCompression(t *testing.T) {
boostrap := func(compressed bool) string {
const (
segmentSize = pageSize
recordSize = (pageSize / 2) - recordHeaderSize
records = 100
)
dirPath, err := ioutil.TempDir("", fmt.Sprintf("TestCompression_%t", compressed))
testutil.Ok(t, err)
w, err := NewSize(nil, nil, dirPath, segmentSize, compressed)
testutil.Ok(t, err)
buf := make([]byte, recordSize)
for i := 0; i < records; i++ {
testutil.Ok(t, w.Log(buf))
}
testutil.Ok(t, w.Close())
return dirPath
}
dirCompressed := boostrap(true)
defer func() {
testutil.Ok(t, os.RemoveAll(dirCompressed))
}()
dirUnCompressed := boostrap(false)
defer func() {
testutil.Ok(t, os.RemoveAll(dirUnCompressed))
}()
uncompressedSize := testutil.DirSize(t, dirUnCompressed)
compressedSize := testutil.DirSize(t, dirCompressed)
testutil.Assert(t, float64(uncompressedSize)*0.75 > float64(compressedSize), "Compressing zeroes should save at least 25%% space - uncompressedSize: %d, compressedSize: %d", uncompressedSize, compressedSize)
}
func BenchmarkWAL_LogBatched(b *testing.B) {
for _, compress := range []bool{true, false} {
b.Run(fmt.Sprintf("compress=%t", compress), func(b *testing.B) {
dir, err := ioutil.TempDir("", "bench_logbatch")
testutil.Ok(b, err)
defer func() {
testutil.Ok(b, os.RemoveAll(dir))
}()
w, err := New(nil, nil, dir, compress)
testutil.Ok(b, err)
defer w.Close()
var buf [2048]byte
var recs [][]byte
b.SetBytes(2048)
for i := 0; i < b.N; i++ {
recs = append(recs, buf[:])
if len(recs) < 1000 {
continue
}
err := w.Log(recs...)
testutil.Ok(b, err)
recs = recs[:0]
}
// Stop timer to not count fsync time on close.
// If it's counted batched vs. single benchmarks are very similar but
// do not show burst throughput well.
b.StopTimer()
})
}
}
func BenchmarkWAL_Log(b *testing.B) {
for _, compress := range []bool{true, false} {
b.Run(fmt.Sprintf("compress=%t", compress), func(b *testing.B) {
dir, err := ioutil.TempDir("", "bench_logsingle")
testutil.Ok(b, err)
defer func() {
testutil.Ok(b, os.RemoveAll(dir))
}()
w, err := New(nil, nil, dir, compress)
testutil.Ok(b, err)
defer w.Close()
var buf [2048]byte
b.SetBytes(2048)
for i := 0; i < b.N; i++ {
err := w.Log(buf[:])
testutil.Ok(b, err)
}
// Stop timer to not count fsync time on close.
// If it's counted batched vs. single benchmarks are very similar but
// do not show burst throughput well.
b.StopTimer()
})
}
}

566
tsdb/wal_test.go Normal file
View file

@ -0,0 +1,566 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !windows
package tsdb
import (
"encoding/binary"
"io"
"io/ioutil"
"math/rand"
"os"
"path"
"path/filepath"
"testing"
"time"
"github.com/go-kit/kit/log"
"github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil"
"github.com/prometheus/tsdb/wal"
)
func TestSegmentWAL_cut(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test_wal_cut")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(tmpdir))
}()
// This calls cut() implicitly the first time without a previous tail.
w, err := OpenSegmentWAL(tmpdir, nil, 0, nil)
testutil.Ok(t, err)
testutil.Ok(t, w.write(WALEntrySeries, 1, []byte("Hello World!!")))
testutil.Ok(t, w.cut())
// Cutting creates a new file.
testutil.Equals(t, 2, len(w.files))
testutil.Ok(t, w.write(WALEntrySeries, 1, []byte("Hello World!!")))
testutil.Ok(t, w.Close())
for _, of := range w.files {
f, err := os.Open(of.Name())
testutil.Ok(t, err)
// Verify header data.
metab := make([]byte, 8)
_, err = f.Read(metab)
testutil.Ok(t, err)
testutil.Equals(t, WALMagic, binary.BigEndian.Uint32(metab[:4]))
testutil.Equals(t, WALFormatDefault, metab[4])
// We cannot actually check for correct pre-allocation as it is
// optional per filesystem and handled transparently.
et, flag, b, err := newWALReader(nil, nil).entry(f)
testutil.Ok(t, err)
testutil.Equals(t, WALEntrySeries, et)
testutil.Equals(t, byte(walSeriesSimple), flag)
testutil.Equals(t, []byte("Hello World!!"), b)
}
}
func TestSegmentWAL_Truncate(t *testing.T) {
const (
numMetrics = 20000
batch = 100
)
series, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), numMetrics)
testutil.Ok(t, err)
dir, err := ioutil.TempDir("", "test_wal_log_truncate")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := OpenSegmentWAL(dir, nil, 0, nil)
testutil.Ok(t, err)
w.segmentSize = 10000
for i := 0; i < numMetrics; i += batch {
var rs []RefSeries
for j, s := range series[i : i+batch] {
rs = append(rs, RefSeries{Labels: s, Ref: uint64(i+j) + 1})
}
err := w.LogSeries(rs)
testutil.Ok(t, err)
}
// We mark the 2nd half of the files with a min timestamp that should discard
// them from the selection of compactable files.
for i, f := range w.files[len(w.files)/2:] {
f.maxTime = int64(1000 + i)
}
// All series in those files must be preserved regarding of the provided postings list.
boundarySeries := w.files[len(w.files)/2].minSeries
// We truncate while keeping every 2nd series.
keep := map[uint64]struct{}{}
for i := 1; i <= numMetrics; i += 2 {
keep[uint64(i)] = struct{}{}
}
keepf := func(id uint64) bool {
_, ok := keep[id]
return ok
}
err = w.Truncate(1000, keepf)
testutil.Ok(t, err)
var expected []RefSeries
for i := 1; i <= numMetrics; i++ {
if i%2 == 1 || uint64(i) >= boundarySeries {
expected = append(expected, RefSeries{Ref: uint64(i), Labels: series[i-1]})
}
}
// Call Truncate once again to see whether we can read the written file without
// creating a new WAL.
err = w.Truncate(1000, keepf)
testutil.Ok(t, err)
testutil.Ok(t, w.Close())
// The same again with a new WAL.
w, err = OpenSegmentWAL(dir, nil, 0, nil)
testutil.Ok(t, err)
var readSeries []RefSeries
r := w.Reader()
testutil.Ok(t, r.Read(func(s []RefSeries) {
readSeries = append(readSeries, s...)
}, nil, nil))
testutil.Equals(t, expected, readSeries)
}
// Symmetrical test of reading and writing to the WAL via its main interface.
func TestSegmentWAL_Log_Restore(t *testing.T) {
const (
numMetrics = 50
iterations = 5
stepSize = 5
)
// Generate testing data. It does not make semantical sense but
// for the purpose of this test.
series, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), numMetrics)
testutil.Ok(t, err)
dir, err := ioutil.TempDir("", "test_wal_log_restore")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
var (
recordedSeries [][]RefSeries
recordedSamples [][]RefSample
recordedDeletes [][]Stone
)
var totalSamples int
// Open WAL a bunch of times, validate all previous data can be read,
// write more data to it, close it.
for k := 0; k < numMetrics; k += numMetrics / iterations {
w, err := OpenSegmentWAL(dir, nil, 0, nil)
testutil.Ok(t, err)
// Set smaller segment size so we can actually write several files.
w.segmentSize = 1000 * 1000
r := w.Reader()
var (
resultSeries [][]RefSeries
resultSamples [][]RefSample
resultDeletes [][]Stone
)
serf := func(series []RefSeries) {
if len(series) > 0 {
clsets := make([]RefSeries, len(series))
copy(clsets, series)
resultSeries = append(resultSeries, clsets)
}
}
smplf := func(smpls []RefSample) {
if len(smpls) > 0 {
csmpls := make([]RefSample, len(smpls))
copy(csmpls, smpls)
resultSamples = append(resultSamples, csmpls)
}
}
delf := func(stones []Stone) {
if len(stones) > 0 {
cst := make([]Stone, len(stones))
copy(cst, stones)
resultDeletes = append(resultDeletes, cst)
}
}
testutil.Ok(t, r.Read(serf, smplf, delf))
testutil.Equals(t, recordedSamples, resultSamples)
testutil.Equals(t, recordedSeries, resultSeries)
testutil.Equals(t, recordedDeletes, resultDeletes)
series := series[k : k+(numMetrics/iterations)]
// Insert in batches and generate different amounts of samples for each.
for i := 0; i < len(series); i += stepSize {
var samples []RefSample
var stones []Stone
for j := 0; j < i*10; j++ {
samples = append(samples, RefSample{
Ref: uint64(j % 10000),
T: int64(j * 2),
V: rand.Float64(),
})
}
for j := 0; j < i*20; j++ {
ts := rand.Int63()
stones = append(stones, Stone{rand.Uint64(), Intervals{{ts, ts + rand.Int63n(10000)}}})
}
lbls := series[i : i+stepSize]
series := make([]RefSeries, 0, len(series))
for j, l := range lbls {
series = append(series, RefSeries{
Ref: uint64(i + j),
Labels: l,
})
}
testutil.Ok(t, w.LogSeries(series))
testutil.Ok(t, w.LogSamples(samples))
testutil.Ok(t, w.LogDeletes(stones))
if len(lbls) > 0 {
recordedSeries = append(recordedSeries, series)
}
if len(samples) > 0 {
recordedSamples = append(recordedSamples, samples)
totalSamples += len(samples)
}
if len(stones) > 0 {
recordedDeletes = append(recordedDeletes, stones)
}
}
testutil.Ok(t, w.Close())
}
}
func TestWALRestoreCorrupted_invalidSegment(t *testing.T) {
dir, err := ioutil.TempDir("", "test_wal_log_restore")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
wal, err := OpenSegmentWAL(dir, nil, 0, nil)
testutil.Ok(t, err)
_, err = wal.createSegmentFile(filepath.Join(dir, "000000"))
testutil.Ok(t, err)
f, err := wal.createSegmentFile(filepath.Join(dir, "000001"))
testutil.Ok(t, err)
f2, err := wal.createSegmentFile(filepath.Join(dir, "000002"))
testutil.Ok(t, err)
testutil.Ok(t, f2.Close())
// Make header of second segment invalid.
_, err = f.WriteAt([]byte{1, 2, 3, 4}, 0)
testutil.Ok(t, err)
testutil.Ok(t, f.Close())
testutil.Ok(t, wal.Close())
_, err = OpenSegmentWAL(dir, log.NewLogfmtLogger(os.Stderr), 0, nil)
testutil.Ok(t, err)
fns, err := fileutil.ReadDir(dir)
testutil.Ok(t, err)
testutil.Equals(t, []string{"000000"}, fns)
}
// Test reading from a WAL that has been corrupted through various means.
func TestWALRestoreCorrupted(t *testing.T) {
cases := []struct {
name string
f func(*testing.T, *SegmentWAL)
}{
{
name: "truncate_checksum",
f: func(t *testing.T, w *SegmentWAL) {
f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
testutil.Ok(t, err)
defer f.Close()
off, err := f.Seek(0, io.SeekEnd)
testutil.Ok(t, err)
testutil.Ok(t, f.Truncate(off-1))
},
},
{
name: "truncate_body",
f: func(t *testing.T, w *SegmentWAL) {
f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
testutil.Ok(t, err)
defer f.Close()
off, err := f.Seek(0, io.SeekEnd)
testutil.Ok(t, err)
testutil.Ok(t, f.Truncate(off-8))
},
},
{
name: "body_content",
f: func(t *testing.T, w *SegmentWAL) {
f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
testutil.Ok(t, err)
defer f.Close()
off, err := f.Seek(0, io.SeekEnd)
testutil.Ok(t, err)
// Write junk before checksum starts.
_, err = f.WriteAt([]byte{1, 2, 3, 4}, off-8)
testutil.Ok(t, err)
},
},
{
name: "checksum",
f: func(t *testing.T, w *SegmentWAL) {
f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
testutil.Ok(t, err)
defer f.Close()
off, err := f.Seek(0, io.SeekEnd)
testutil.Ok(t, err)
// Write junk into checksum
_, err = f.WriteAt([]byte{1, 2, 3, 4}, off-4)
testutil.Ok(t, err)
},
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
// Generate testing data. It does not make semantical sense but
// for the purpose of this test.
dir, err := ioutil.TempDir("", "test_corrupted")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
w, err := OpenSegmentWAL(dir, nil, 0, nil)
testutil.Ok(t, err)
testutil.Ok(t, w.LogSamples([]RefSample{{T: 1, V: 2}}))
testutil.Ok(t, w.LogSamples([]RefSample{{T: 2, V: 3}}))
testutil.Ok(t, w.cut())
// Sleep 2 seconds to avoid error where cut and test "cases" function may write or
// truncate the file out of orders as "cases" are not synchronized with cut.
// Hopefully cut will complete by 2 seconds.
time.Sleep(2 * time.Second)
testutil.Ok(t, w.LogSamples([]RefSample{{T: 3, V: 4}}))
testutil.Ok(t, w.LogSamples([]RefSample{{T: 5, V: 6}}))
testutil.Ok(t, w.Close())
// cut() truncates and fsyncs the first segment async. If it happens after
// the corruption we apply below, the corruption will be overwritten again.
// Fire and forget a sync to avoid flakyness.
w.files[0].Sync()
// Corrupt the second entry in the first file.
// After re-opening we must be able to read the first entry
// and the rest, including the second file, must be truncated for clean further
// writes.
c.f(t, w)
logger := log.NewLogfmtLogger(os.Stderr)
w2, err := OpenSegmentWAL(dir, logger, 0, nil)
testutil.Ok(t, err)
r := w2.Reader()
serf := func(l []RefSeries) {
testutil.Equals(t, 0, len(l))
}
// Weird hack to check order of reads.
i := 0
samplf := func(s []RefSample) {
if i == 0 {
testutil.Equals(t, []RefSample{{T: 1, V: 2}}, s)
i++
} else {
testutil.Equals(t, []RefSample{{T: 99, V: 100}}, s)
}
}
testutil.Ok(t, r.Read(serf, samplf, nil))
testutil.Ok(t, w2.LogSamples([]RefSample{{T: 99, V: 100}}))
testutil.Ok(t, w2.Close())
// We should see the first valid entry and the new one, everything after
// is truncated.
w3, err := OpenSegmentWAL(dir, logger, 0, nil)
testutil.Ok(t, err)
r = w3.Reader()
i = 0
testutil.Ok(t, r.Read(serf, samplf, nil))
})
}
}
func TestMigrateWAL_Empty(t *testing.T) {
// The migration proecedure must properly deal with a zero-length segment,
// which is valid in the new format.
dir, err := ioutil.TempDir("", "walmigrate")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
wdir := path.Join(dir, "wal")
// Initialize empty WAL.
w, err := wal.New(nil, nil, wdir, false)
testutil.Ok(t, err)
testutil.Ok(t, w.Close())
testutil.Ok(t, MigrateWAL(nil, wdir))
}
func TestMigrateWAL_Fuzz(t *testing.T) {
dir, err := ioutil.TempDir("", "walmigrate")
testutil.Ok(t, err)
defer func() {
testutil.Ok(t, os.RemoveAll(dir))
}()
wdir := path.Join(dir, "wal")
// Should pass if no WAL exists yet.
testutil.Ok(t, MigrateWAL(nil, wdir))
oldWAL, err := OpenSegmentWAL(wdir, nil, time.Minute, nil)
testutil.Ok(t, err)
// Write some data.
testutil.Ok(t, oldWAL.LogSeries([]RefSeries{
{Ref: 100, Labels: labels.FromStrings("abc", "def", "123", "456")},
{Ref: 1, Labels: labels.FromStrings("abc", "def2", "1234", "4567")},
}))
testutil.Ok(t, oldWAL.LogSamples([]RefSample{
{Ref: 1, T: 100, V: 200},
{Ref: 2, T: 300, V: 400},
}))
testutil.Ok(t, oldWAL.LogSeries([]RefSeries{
{Ref: 200, Labels: labels.FromStrings("xyz", "def", "foo", "bar")},
}))
testutil.Ok(t, oldWAL.LogSamples([]RefSample{
{Ref: 3, T: 100, V: 200},
{Ref: 4, T: 300, V: 400},
}))
testutil.Ok(t, oldWAL.LogDeletes([]Stone{
{ref: 1, intervals: []Interval{{100, 200}}},
}))
testutil.Ok(t, oldWAL.Close())
// Perform migration.
testutil.Ok(t, MigrateWAL(nil, wdir))
w, err := wal.New(nil, nil, wdir, false)
testutil.Ok(t, err)
// We can properly write some new data after migration.
var enc RecordEncoder
testutil.Ok(t, w.Log(enc.Samples([]RefSample{
{Ref: 500, T: 1, V: 1},
}, nil)))
testutil.Ok(t, w.Close())
// Read back all data.
sr, err := wal.NewSegmentsReader(wdir)
testutil.Ok(t, err)
r := wal.NewReader(sr)
var res []interface{}
var dec RecordDecoder
for r.Next() {
rec := r.Record()
switch dec.Type(rec) {
case RecordSeries:
s, err := dec.Series(rec, nil)
testutil.Ok(t, err)
res = append(res, s)
case RecordSamples:
s, err := dec.Samples(rec, nil)
testutil.Ok(t, err)
res = append(res, s)
case RecordTombstones:
s, err := dec.Tombstones(rec, nil)
testutil.Ok(t, err)
res = append(res, s)
default:
t.Fatalf("unknown record type %d", dec.Type(rec))
}
}
testutil.Ok(t, r.Err())
testutil.Equals(t, []interface{}{
[]RefSeries{
{Ref: 100, Labels: labels.FromStrings("abc", "def", "123", "456")},
{Ref: 1, Labels: labels.FromStrings("abc", "def2", "1234", "4567")},
},
[]RefSample{{Ref: 1, T: 100, V: 200}, {Ref: 2, T: 300, V: 400}},
[]RefSeries{
{Ref: 200, Labels: labels.FromStrings("xyz", "def", "foo", "bar")},
},
[]RefSample{{Ref: 3, T: 100, V: 200}, {Ref: 4, T: 300, V: 400}},
[]Stone{{ref: 1, intervals: []Interval{{100, 200}}}},
[]RefSample{{Ref: 500, T: 1, V: 1}},
}, res)
// Migrating an already migrated WAL shouldn't do anything.
testutil.Ok(t, MigrateWAL(nil, wdir))
}