Merge remote-tracking branch 'tsdb/merge-tsdb' into merge-tsdb-new

2025-03-05 20:59:13 -08:00 · 2019-08-13 13:59:13 +05:30 · 2019-08-13 13:59:13 +05:30 · 750e438ebb
parent 6e53980bbf 7cf09b0395
commit 750e438ebb
98 changed files with 48708 additions and 0 deletions
--- a/tsdb/.github/PULL_REQUEST_TEMPLATE.md
+++ b/tsdb/.github/PULL_REQUEST_TEMPLATE.md
@ -0,0 +1,17 @@
+<!--
+    Don't forget!
+    
+    - Most PRs would require a CHANGELOG entry.
+    
+    - If the PR adds or changes a behaviour or fixes a bug of an exported API it would need a unit/e2e test.
+    
+    - Where possible use only exported APIs for tests to simplify the review and make it as close as possible to an actual library usage.
+    
+    - No tests are needed for internal implementation changes.
+    
+    - Performance improvements would need a benchmark test to prove it.
+    
+    - All exposed objects should have a comment.
+    
+    - All comments should start with a capital letter and end with a full stop.
+ -->
--- a/tsdb/.gitignore
+++ b/tsdb/.gitignore
@ -0,0 +1 @@
+benchout/
--- a/tsdb/.golangci.yml
+++ b/tsdb/.golangci.yml
@ -0,0 +1,5 @@
+# Run only staticcheck for now. Additional linters will be enabled one-by-one.
+linters:
+  enable:
+  - staticcheck
+  disable-all: true
--- a/tsdb/.travis.yml
+++ b/tsdb/.travis.yml
@ -0,0 +1,20 @@
+dist: trusty
+language: go
+os:
+  - windows
+  - linux
+  - osx
+
+go:
+  - 1.12.x
+
+go_import_path: github.com/prometheus/tsdb
+
+before_install:
+  - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install make; fi
+
+install:
+  - make deps
+
+script:
+  - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then make test; else make all; fi
--- a/tsdb/CHANGELOG.md
+++ b/tsdb/CHANGELOG.md
@ -0,0 +1,108 @@
+## master / unreleased
+
+## 0.10.0
+
+ - [FEATURE] Added `DBReadOnly` to allow opening a database in read only mode.
+    - `DBReadOnly.Blocks()` exposes a slice of `BlockReader`s.
+    - `BlockReader` interface - removed MinTime/MaxTime methods and now exposes the full block meta via `Meta()`.
+ - [FEATURE] `chunkenc.Chunk.Iterator` method now takes a `chunkenc.Iterator` interface as an argument for reuse.
+
+## 0.9.1
+
+ - [CHANGE] LiveReader metrics are now injected rather than global.
+
+## 0.9.0
+
+ - [FEATURE] Provide option to compress WAL records using Snappy. [#609](https://github.com/prometheus/tsdb/pull/609)
+ - [BUGFIX] Re-calculate block size when calling `block.Delete`.
+ - [BUGFIX] Re-encode all head chunks at compaction that are open (being appended to) or outside the Maxt block range. This avoids writing out corrupt data. It happens when snapshotting with the head included.
+ - [BUGFIX] Improved handling of multiple refs for the same series in WAL reading.
+ - [BUGFIX] `prometheus_tsdb_compactions_failed_total` is now incremented on any compaction failure.
+ - [CHANGE] The meta file `BlockStats` no longer holds size information. This is now dynamically calculated and kept in memory. It also includes the meta file size which was not included before.
+ - [CHANGE] Create new clean segment when starting the WAL.
+ - [CHANGE] Renamed metric from `prometheus_tsdb_wal_reader_corruption_errors` to `prometheus_tsdb_wal_reader_corruption_errors_total`.
+ - [ENHANCEMENT] Improved atomicity of .tmp block replacement during compaction for usual case.
+ - [ENHANCEMENT] Improved postings intersection matching.
+ - [ENHANCEMENT] Reduced disk usage for WAL for small setups.
+ - [ENHANCEMENT] Optimize queries using regexp for set lookups.
+
+
+## 0.8.0
+
+ - [BUGFIX] Calling `Close` more than once on a querier returns an error instead of a panic.
+ - [BUGFIX] Don't panic and recover nicely when running out of disk space.
+ - [BUGFIX] Correctly handle empty labels.
+ - [BUGFIX] Don't crash on an unknown tombstone ref.
+ - [ENHANCEMENT] Re-add FromData function to create a chunk from bytes. It is used by Cortex and Thanos.
+ - [ENHANCEMENT] Simplify mergedPostings.Seek.
+ - [FEATURE]  Added `currentSegment` metric for the current WAL segment it is being written to.
+
+## 0.7.1
+
+ - [ENHANCEMENT] Reduce memory usage in mergedPostings.Seek
+
+## 0.7.0
+
+ - [CHANGE] tsdb now requires golang 1.12 or higher.
+ - [REMOVED] `chunks.NewReader` is removed as it wasn't used anywhere.
+ - [REMOVED] `FromData` is considered unused so was removed.
+ - [FEATURE] Added option WALSegmentSize -1 to disable the WAL.
+ - [BUGFIX] Bugfix in selectOverlappingDirs. Only return the first overlapping blocks.
+ - [BUGFIX] Fsync the meta file to persist it on disk to avoid data loss in case of a host crash.
+ - [BUGFIX] Fix fd and vm_area leak on error path in chunks.NewDirReader.
+ - [BUGFIX] Fix fd and vm_area leak on error path in index.NewFileReader.
+ - [BUGFIX] Force persisting the tombstone file to avoid data loss in case of a host crash.
+ - [BUGFIX] Keep series that are still in WAL in checkpoints.
+ - [ENHANCEMENT] Fast path for EmptyPostings cases in Merge, Intersect and Without.
+ - [ENHANCEMENT] Be smarter in how we look at matchers.
+ - [ENHANCEMENT] PostListings and NotMatcher now public.
+
+## 0.6.1
+
+  - [BUGFIX] Update `last` after appending a non-overlapping chunk in `chunks.MergeOverlappingChunks`. [#539](https://github.com/prometheus/tsdb/pull/539)
+
+## 0.6.0
+
+  - [CHANGE] `AllowOverlappingBlock` is now `AllowOverlappingBlocks`.
+
+## 0.5.0
+
+ - [FEATURE] Time-overlapping blocks are now allowed. [#370](https://github.com/prometheus/tsdb/pull/370)
+   - Disabled by default and can be enabled via `AllowOverlappingBlock` option.
+   - Added `MergeChunks` function in `chunkenc/xor.go` to merge 2 time-overlapping chunks.
+   - Added `MergeOverlappingChunks` function in `chunks/chunks.go` to merge multiple time-overlapping Chunk Metas.
+   - Added `MinTime` and `MaxTime` method for `BlockReader`.
+ - [FEATURE] New `dump` command to tsdb tool to dump all samples.
+ - [FEATURE] New `encoding` package for common binary encoding/decoding helpers.
+    - Added to remove some code duplication.
+ - [ENHANCEMENT] When closing the db any running compaction will be cancelled so it doesn't block.
+   - `NewLeveledCompactor` takes a context.
+ - [CHANGE] `prometheus_tsdb_storage_blocks_bytes_total` is now `prometheus_tsdb_storage_blocks_bytes`.
+ - [BUGFIX] Improved Postings Merge performance. Fixes a regression from the previous release.
+ - [BUGFIX] LiveReader can get into an infinite loop on corrupt WALs.
+
+## 0.4.0
+
+ - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
+ - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
+ - [FEATURE]  Size based retention through `Options.MaxBytes`.  As part of this change:
+   - Added new metrics - `prometheus_tsdb_storage_blocks_bytes_total`, `prometheus_tsdb_size_retentions_total`, `prometheus_tsdb_time_retentions_total`
+   - New public interface `SizeReader: Size() int64`
+   - `OpenBlock` signature changed to take a logger.
+ - [REMOVED] `PrefixMatcher` is considered unused so was removed.
+ - [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
+ - [FEATURE] Add new `LiveReader` to WAL package. Added to allow live tailing of a WAL segment, used by Prometheus Remote Write after refactor. The main difference between the new reader and the existing `Reader` is that for `LiveReader` a call to `Next()` that returns false does not mean that there will never be more data to read.
+
+## 0.3.1
+
+ - [BUGFIX] Fixed most Windows tests and some actual bugs for unclosed file readers.
+
+## 0.3.0
+
+ - [CHANGE] `LastCheckpoint()` used to return just the segment name and now it returns the full relative path.
+ - [CHANGE] `NewSegmentsRangeReader()` can now read over multiple wal ranges by using the new `SegmentRange{}` struct.
+ - [CHANGE] `CorruptionErr{}` now also exposes the Segment `Dir` which is added when displaying any errors.
+ - [CHANGE] `Head.Init()` is changed to `Head.Init(minValidTime int64)`
+ - [CHANGE] `SymbolTable()` renamed to `SymbolTableSize()` to make the name consistent with the  `Block{ symbolTableSize uint64 }` field.
+ - [CHANGE] `wal.Reader{}` now exposes `Segment()` for the current segment being read  and `Offset()` for the current offset.
+ - [FEATURE] tsdbutil analyze subcommand to find churn, high cardinality, etc.
--- a/tsdb/LICENSE
+++ b/tsdb/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/tsdb/MAINTAINERS.md
+++ b/tsdb/MAINTAINERS.md
@ -0,0 +1,4 @@
+Maintainers of this repository:
+
+* Krasi Georgiev <kgeorgie@redhat.com> @krasi-georgiev
+* Goutham Veeramachaneni <gouthamve@gmail.com> @gouthamve
--- a/tsdb/Makefile
+++ b/tsdb/Makefile
@ -0,0 +1,33 @@
+# Copyright 2018 The Prometheus Authors
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TSDB_PROJECT_DIR = "."
+TSDB_CLI_DIR="$(TSDB_PROJECT_DIR)/cmd/tsdb"
+TSDB_BIN = "$(TSDB_CLI_DIR)/tsdb"
+TSDB_BENCHMARK_NUM_METRICS ?= 1000
+TSDB_BENCHMARK_DATASET ?= "$(TSDB_PROJECT_DIR)/testdata/20kseries.json"
+TSDB_BENCHMARK_OUTPUT_DIR ?= "$(TSDB_CLI_DIR)/benchout"
+
+include Makefile.common
+
+build:
+	GO111MODULE=$(GO111MODULE) $(GO) build -o $(TSDB_BIN) $(TSDB_CLI_DIR)
+
+bench: build
+	@echo ">> running benchmark, writing result to $(TSDB_BENCHMARK_OUTPUT_DIR)"
+	@$(TSDB_BIN) bench write --metrics=$(TSDB_BENCHMARK_NUM_METRICS) --out=$(TSDB_BENCHMARK_OUTPUT_DIR) $(TSDB_BENCHMARK_DATASET)
+	@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/cpu.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/cpuprof.svg
+	@$(GO) tool pprof --inuse_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.inuse.svg
+	@$(GO) tool pprof --alloc_space -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mem.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/memprof.alloc.svg
+	@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/block.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/blockprof.svg
+	@$(GO) tool pprof -svg $(TSDB_BIN) $(TSDB_BENCHMARK_OUTPUT_DIR)/mutex.prof > $(TSDB_BENCHMARK_OUTPUT_DIR)/mutexprof.svg
--- a/tsdb/Makefile.common
+++ b/tsdb/Makefile.common
@ -0,0 +1,277 @@
+# Copyright 2018 The Prometheus Authors
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# A common Makefile that includes rules to be reused in different prometheus projects.
+# !!! Open PRs only against the prometheus/prometheus/Makefile.common repository!
+
+# Example usage :
+# Create the main Makefile in the root project directory.
+# include Makefile.common
+# customTarget:
+# 	@echo ">> Running customTarget"
+#
+
+# Ensure GOBIN is not set during build so that promu is installed to the correct path
+unexport GOBIN
+
+GO           ?= go
+GOFMT        ?= $(GO)fmt
+FIRST_GOPATH := $(firstword $(subst :, ,$(shell $(GO) env GOPATH)))
+GOOPTS       ?=
+GOHOSTOS     ?= $(shell $(GO) env GOHOSTOS)
+GOHOSTARCH   ?= $(shell $(GO) env GOHOSTARCH)
+
+GO_VERSION        ?= $(shell $(GO) version)
+GO_VERSION_NUMBER ?= $(word 3, $(GO_VERSION))
+PRE_GO_111        ?= $(shell echo $(GO_VERSION_NUMBER) | grep -E 'go1\.(10|[0-9])\.')
+
+GOVENDOR :=
+GO111MODULE :=
+ifeq (, $(PRE_GO_111))
+	ifneq (,$(wildcard go.mod))
+		# Enforce Go modules support just in case the directory is inside GOPATH (and for Travis CI).
+		GO111MODULE := on
+
+		ifneq (,$(wildcard vendor))
+			# Always use the local vendor/ directory to satisfy the dependencies.
+			GOOPTS := $(GOOPTS) -mod=vendor
+		endif
+	endif
+else
+	ifneq (,$(wildcard go.mod))
+		ifneq (,$(wildcard vendor))
+$(warning This repository requires Go >= 1.11 because of Go modules)
+$(warning Some recipes may not work as expected as the current Go runtime is '$(GO_VERSION_NUMBER)')
+		endif
+	else
+		# This repository isn't using Go modules (yet).
+		GOVENDOR := $(FIRST_GOPATH)/bin/govendor
+	endif
+endif
+PROMU        := $(FIRST_GOPATH)/bin/promu
+pkgs          = ./...
+
+ifeq (arm, $(GOHOSTARCH))
+	GOHOSTARM ?= $(shell GOARM= $(GO) env GOARM)
+	GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH)v$(GOHOSTARM)
+else
+	GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH)
+endif
+
+PROMU_VERSION ?= 0.5.0
+PROMU_URL     := https://github.com/prometheus/promu/releases/download/v$(PROMU_VERSION)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM).tar.gz
+
+GOLANGCI_LINT :=
+GOLANGCI_LINT_OPTS ?=
+GOLANGCI_LINT_VERSION ?= v1.17.1
+# golangci-lint only supports linux, darwin and windows platforms on i386/amd64.
+# windows isn't included here because of the path separator being different.
+ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin))
+	ifeq ($(GOHOSTARCH),$(filter $(GOHOSTARCH),amd64 i386))
+		GOLANGCI_LINT := $(FIRST_GOPATH)/bin/golangci-lint
+	endif
+endif
+
+PREFIX                  ?= $(shell pwd)
+BIN_DIR                 ?= $(shell pwd)
+DOCKER_IMAGE_TAG        ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))
+DOCKERFILE_PATH         ?= ./Dockerfile
+DOCKERBUILD_CONTEXT     ?= ./
+DOCKER_REPO             ?= prom
+
+DOCKER_ARCHS            ?= amd64
+
+BUILD_DOCKER_ARCHS = $(addprefix common-docker-,$(DOCKER_ARCHS))
+PUBLISH_DOCKER_ARCHS = $(addprefix common-docker-publish-,$(DOCKER_ARCHS))
+TAG_DOCKER_ARCHS = $(addprefix common-docker-tag-latest-,$(DOCKER_ARCHS))
+
+ifeq ($(GOHOSTARCH),amd64)
+        ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux freebsd darwin windows))
+                # Only supported on amd64
+                test-flags := -race
+        endif
+endif
+
+# This rule is used to forward a target like "build" to "common-build".  This
+# allows a new "build" target to be defined in a Makefile which includes this
+# one and override "common-build" without override warnings.
+%: common-% ;
+
+.PHONY: common-all
+common-all: precheck style check_license lint unused build test
+
+.PHONY: common-style
+common-style:
+	@echo ">> checking code style"
+	@fmtRes=$$($(GOFMT) -d $$(find . -path ./vendor -prune -o -name '*.go' -print)); \
+	if [ -n "$${fmtRes}" ]; then \
+		echo "gofmt checking failed!"; echo "$${fmtRes}"; echo; \
+		echo "Please ensure you are using $$($(GO) version) for formatting code."; \
+		exit 1; \
+	fi
+
+.PHONY: common-check_license
+common-check_license:
+	@echo ">> checking license header"
+	@licRes=$$(for file in $$(find . -type f -iname '*.go' ! -path './vendor/*') ; do \
+               awk 'NR<=3' $$file | grep -Eq "(Copyright|generated|GENERATED)" || echo $$file; \
+       done); \
+       if [ -n "$${licRes}" ]; then \
+               echo "license header checking failed:"; echo "$${licRes}"; \
+               exit 1; \
+       fi
+
+.PHONY: common-deps
+common-deps:
+	@echo ">> getting dependencies"
+ifdef GO111MODULE
+	GO111MODULE=$(GO111MODULE) $(GO) mod download
+else
+	$(GO) get $(GOOPTS) -t ./...
+endif
+
+.PHONY: common-test-short
+common-test-short:
+	@echo ">> running short tests"
+	GO111MODULE=$(GO111MODULE) $(GO) test -short $(GOOPTS) $(pkgs)
+
+.PHONY: common-test
+common-test:
+	@echo ">> running all tests"
+	GO111MODULE=$(GO111MODULE) $(GO) test $(test-flags) $(GOOPTS) $(pkgs)
+
+.PHONY: common-format
+common-format:
+	@echo ">> formatting code"
+	GO111MODULE=$(GO111MODULE) $(GO) fmt $(pkgs)
+
+.PHONY: common-vet
+common-vet:
+	@echo ">> vetting code"
+	GO111MODULE=$(GO111MODULE) $(GO) vet $(GOOPTS) $(pkgs)
+
+.PHONY: common-lint
+common-lint: $(GOLANGCI_LINT)
+ifdef GOLANGCI_LINT
+	@echo ">> running golangci-lint"
+ifdef GO111MODULE
+# 'go list' needs to be executed before staticcheck to prepopulate the modules cache.
+# Otherwise staticcheck might fail randomly for some reason not yet explained.
+	GO111MODULE=$(GO111MODULE) $(GO) list -e -compiled -test=true -export=false -deps=true -find=false -tags= -- ./... > /dev/null
+	GO111MODULE=$(GO111MODULE) $(GOLANGCI_LINT) run $(GOLANGCI_LINT_OPTS) $(pkgs)
+else
+	$(GOLANGCI_LINT) run $(pkgs)
+endif
+endif
+
+# For backward-compatibility.
+.PHONY: common-staticcheck
+common-staticcheck: lint
+
+.PHONY: common-unused
+common-unused: $(GOVENDOR)
+ifdef GOVENDOR
+	@echo ">> running check for unused packages"
+	@$(GOVENDOR) list +unused | grep . && exit 1 || echo 'No unused packages'
+else
+ifdef GO111MODULE
+	@echo ">> running check for unused/missing packages in go.mod"
+	GO111MODULE=$(GO111MODULE) $(GO) mod tidy
+ifeq (,$(wildcard vendor))
+	@git diff --exit-code -- go.sum go.mod
+else
+	@echo ">> running check for unused packages in vendor/"
+	GO111MODULE=$(GO111MODULE) $(GO) mod vendor
+	@git diff --exit-code -- go.sum go.mod vendor/
+endif
+endif
+endif
+
+.PHONY: common-build
+common-build: promu
+	@echo ">> building binaries"
+	GO111MODULE=$(GO111MODULE) $(PROMU) build --prefix $(PREFIX)
+
+.PHONY: common-tarball
+common-tarball: promu
+	@echo ">> building release tarball"
+	$(PROMU) tarball --prefix $(PREFIX) $(BIN_DIR)
+
+.PHONY: common-docker $(BUILD_DOCKER_ARCHS)
+common-docker: $(BUILD_DOCKER_ARCHS)
+$(BUILD_DOCKER_ARCHS): common-docker-%:
+	docker build -t "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)" \
+		-f $(DOCKERFILE_PATH) \
+		--build-arg ARCH="$*" \
+		--build-arg OS="linux" \
+		$(DOCKERBUILD_CONTEXT)
+
+.PHONY: common-docker-publish $(PUBLISH_DOCKER_ARCHS)
+common-docker-publish: $(PUBLISH_DOCKER_ARCHS)
+$(PUBLISH_DOCKER_ARCHS): common-docker-publish-%:
+	docker push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)"
+
+.PHONY: common-docker-tag-latest $(TAG_DOCKER_ARCHS)
+common-docker-tag-latest: $(TAG_DOCKER_ARCHS)
+$(TAG_DOCKER_ARCHS): common-docker-tag-latest-%:
+	docker tag "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:$(DOCKER_IMAGE_TAG)" "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$*:latest"
+
+.PHONY: common-docker-manifest
+common-docker-manifest:
+	DOCKER_CLI_EXPERIMENTAL=enabled docker manifest create -a "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)" $(foreach ARCH,$(DOCKER_ARCHS),$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-$(ARCH):$(DOCKER_IMAGE_TAG))
+	DOCKER_CLI_EXPERIMENTAL=enabled docker manifest push "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_TAG)"
+
+.PHONY: promu
+promu: $(PROMU)
+
+$(PROMU):
+	$(eval PROMU_TMP := $(shell mktemp -d))
+	curl -s -L $(PROMU_URL) | tar -xvzf - -C $(PROMU_TMP)
+	mkdir -p $(FIRST_GOPATH)/bin
+	cp $(PROMU_TMP)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM)/promu $(FIRST_GOPATH)/bin/promu
+	rm -r $(PROMU_TMP)
+
+.PHONY: proto
+proto:
+	@echo ">> generating code from proto files"
+	@./scripts/genproto.sh
+
+ifdef GOLANGCI_LINT
+$(GOLANGCI_LINT):
+	mkdir -p $(FIRST_GOPATH)/bin
+	curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/$(GOLANGCI_LINT_VERSION)/install.sh \
+		| sed -e '/install -d/d' \
+		| sh -s -- -b $(FIRST_GOPATH)/bin $(GOLANGCI_LINT_VERSION)
+endif
+
+ifdef GOVENDOR
+.PHONY: $(GOVENDOR)
+$(GOVENDOR):
+	GOOS= GOARCH= $(GO) get -u github.com/kardianos/govendor
+endif
+
+.PHONY: precheck
+precheck::
+
+define PRECHECK_COMMAND_template =
+precheck:: $(1)_precheck
+
+PRECHECK_COMMAND_$(1) ?= $(1) $$(strip $$(PRECHECK_OPTIONS_$(1)))
+.PHONY: $(1)_precheck
+$(1)_precheck:
+	@if ! $$(PRECHECK_COMMAND_$(1)) 1>/dev/null 2>&1; then \
+		echo "Execution of '$$(PRECHECK_COMMAND_$(1))' command failed. Is $(1) installed?"; \
+		exit 1; \
+	fi
+endef
--- a/tsdb/README.md
+++ b/tsdb/README.md
@ -0,0 +1,15 @@
+# TSDB 
+
+[![Build Status](https://travis-ci.org/prometheus/tsdb.svg?branch=master)](https://travis-ci.org/prometheus/tsdb)
+[![GoDoc](https://godoc.org/github.com/prometheus/tsdb?status.svg)](https://godoc.org/github.com/prometheus/tsdb)
+[![Go Report Card](https://goreportcard.com/badge/github.com/prometheus/tsdb)](https://goreportcard.com/report/github.com/prometheus/tsdb)
+
+This repository contains the Prometheus storage layer that is used in its 2.x releases.
+
+A writeup of its design can be found [here](https://fabxc.org/blog/2017-04-10-writing-a-tsdb/).
+
+Based on the Gorilla TSDB [white paper](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf).
+
+Video: [Storing 16 Bytes at Scale](https://youtu.be/b_pEevMAC3I) from [PromCon 2017](https://promcon.io/2017-munich/).
+
+See also the [format documentation](docs/format/README.md).
--- a/tsdb/block.go
+++ b/tsdb/block.go
@ -0,0 +1,656 @@
+// Copyright 2017 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"encoding/json"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
+	"github.com/oklog/ulid"
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/tsdb/chunks"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/fileutil"
+	"github.com/prometheus/tsdb/index"
+	"github.com/prometheus/tsdb/labels"
+)
+
+// IndexWriter serializes the index for a block of series data.
+// The methods must be called in the order they are specified in.
+type IndexWriter interface {
+	// AddSymbols registers all string symbols that are encountered in series
+	// and other indices.
+	AddSymbols(sym map[string]struct{}) error
+
+	// AddSeries populates the index writer with a series and its offsets
+	// of chunks that the index can reference.
+	// Implementations may require series to be inserted in increasing order by
+	// their labels.
+	// The reference numbers are used to resolve entries in postings lists that
+	// are added later.
+	AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error
+
+	// WriteLabelIndex serializes an index from label names to values.
+	// The passed in values are chained tuples of strings of the length of names.
+	WriteLabelIndex(names []string, values []string) error
+
+	// WritePostings writes a postings list for a single label pair.
+	// The Postings here contain refs to the series that were added.
+	WritePostings(name, value string, it index.Postings) error
+
+	// Close writes any finalization and closes the resources associated with
+	// the underlying writer.
+	Close() error
+}
+
+// IndexReader provides read access to serialized index data.
+type IndexReader interface {
+	// Symbols returns a set of string symbols that may occur in series' labels
+	// and indices.
+	Symbols() (map[string]struct{}, error)
+
+	// LabelValues returns the possible label values for the given label names.
+	LabelValues(names ...string) (index.StringTuples, error)
+
+	// Postings returns the postings list iterator for the label pair.
+	// The Postings here contain the offsets to the series inside the index.
+	// Found IDs are not strictly required to point to a valid Series, e.g. during
+	// background garbage collections.
+	Postings(name, value string) (index.Postings, error)
+
+	// SortedPostings returns a postings list that is reordered to be sorted
+	// by the label set of the underlying series.
+	SortedPostings(index.Postings) index.Postings
+
+	// Series populates the given labels and chunk metas for the series identified
+	// by the reference.
+	// Returns ErrNotFound if the ref does not resolve to a known series.
+	Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error
+
+	// LabelIndices returns a list of string tuples for which a label value index exists.
+	// NOTE: This is deprecated. Use `LabelNames()` instead.
+	LabelIndices() ([][]string, error)
+
+	// LabelNames returns all the unique label names present in the index in sorted order.
+	LabelNames() ([]string, error)
+
+	// Close releases the underlying resources of the reader.
+	Close() error
+}
+
+// StringTuples provides access to a sorted list of string tuples.
+type StringTuples interface {
+	// Len returns the total number of tuples in the list.
+	Len() int
+	// At returns the tuple at position i.
+	At(i int) ([]string, error)
+}
+
+// ChunkWriter serializes a time block of chunked series data.
+type ChunkWriter interface {
+	// WriteChunks writes several chunks. The Chunk field of each chunks.Meta
+	// must be populated.
+	// After returning successfully, the Ref fields of the passed metas
+	// are set and can be used to retrieve the chunks from the written data.
+	WriteChunks(chunks ...chunks.Meta) error
+
+	// Close writes any required finalization and closes the resources
+	// associated with the underlying writer.
+	Close() error
+}
+
+// ChunkReader provides read access to serialized time series data.
+type ChunkReader interface {
+	// Chunk returns the series data chunk with the given reference.
+	Chunk(ref uint64) (chunkenc.Chunk, error)
+
+	// Close releases all underlying resources of the reader.
+	Close() error
+}
+
+// BlockReader provides read access to a data block.
+type BlockReader interface {
+	// Index returns an IndexReader over the block's data.
+	Index() (IndexReader, error)
+
+	// Chunks returns a ChunkReader over the block's data.
+	Chunks() (ChunkReader, error)
+
+	// Tombstones returns a TombstoneReader over the block's deleted data.
+	Tombstones() (TombstoneReader, error)
+
+	// Meta provides meta information about the block reader.
+	Meta() BlockMeta
+}
+
+// Appendable is implemented by any entity to which data can be appended.
+type Appendable interface {
+	// Appender returns a new Appender against an underlying store.
+	Appender() Appender
+}
+
+// BlockMeta provides meta information about a block.
+// It is the structure that is encoded to and decoded from the block's meta.json file.
+type BlockMeta struct {
+	// Unique identifier for the block and its contents. Changes on compaction.
+	ULID ulid.ULID `json:"ulid"`
+
+	// MinTime and MaxTime specify the time range all samples
+	// in the block are in.
+	MinTime int64 `json:"minTime"`
+	MaxTime int64 `json:"maxTime"`
+
+	// Stats about the contents of the block.
+	Stats BlockStats `json:"stats,omitempty"`
+
+	// Information on compactions the block was created from.
+	Compaction BlockMetaCompaction `json:"compaction"`
+
+	// Version of the index format.
+	// writeMetaFile always stores 1 and readMetaFile rejects any other value.
+	Version int `json:"version"`
+}
+
+// BlockStats contains stats about contents of a block.
+// Fields holding zero are left out of the JSON encoding.
+type BlockStats struct {
+	NumSamples    uint64 `json:"numSamples,omitempty"`
+	NumSeries     uint64 `json:"numSeries,omitempty"`
+	NumChunks     uint64 `json:"numChunks,omitempty"`
+	NumTombstones uint64 `json:"numTombstones,omitempty"`
+}
+
+// BlockDesc describes a block by ULID and time range.
+// It is used by BlockMetaCompaction.Parents to reference other blocks.
+type BlockDesc struct {
+	ULID    ulid.ULID `json:"ulid"`
+	MinTime int64     `json:"minTime"`
+	MaxTime int64     `json:"maxTime"`
+}
+
+// BlockMetaCompaction holds information about compactions a block went through.
+type BlockMetaCompaction struct {
+	// Maximum number of compaction cycles any source block has
+	// gone through.
+	Level int `json:"level"`
+	// ULIDs of all source head blocks that went into the block.
+	Sources []ulid.ULID `json:"sources,omitempty"`
+	// Indicates that during compaction it resulted in a block without any samples
+	// so it should be deleted on the next reload.
+	Deletable bool `json:"deletable,omitempty"`
+	// Short descriptions of the direct blocks that were used to create
+	// this block.
+	Parents []BlockDesc `json:"parents,omitempty"`
+	// Failed is set and persisted by Block.setCompactionFailed.
+	Failed  bool        `json:"failed,omitempty"`
+}
+
+const (
+	// indexFilename is the name of the index file inside a block directory.
+	indexFilename = "index"
+	// metaFilename is the name of the JSON meta file inside a block directory.
+	metaFilename = "meta.json"
+)
+
+// chunkDir returns the path of the chunks directory inside the block directory dir.
+func chunkDir(dir string) string {
+	return filepath.Join(dir, "chunks")
+}
+
+func readMetaFile(dir string) (*BlockMeta, int64, error) {
+	b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename))
+	if err != nil {
+		return nil, 0, err
+	}
+	var m BlockMeta
+
+	if err := json.Unmarshal(b, &m); err != nil {
+		return nil, 0, err
+	}
+	if m.Version != 1 {
+		return nil, 0, errors.Errorf("unexpected meta file version %d", m.Version)
+	}
+
+	return &m, int64(len(b)), nil
+}
+
+// writeMetaFile encodes meta as indented JSON and stores it as the meta file of dir.
+// It sets meta.Version to 1 before encoding and returns the number of bytes written.
+//
+// The content is written and synced to a temporary ".tmp" sibling first, which then
+// replaces the final file so that the change appears atomic. The temporary file is
+// removed on return; a failure to remove it is logged but not returned.
+func writeMetaFile(logger log.Logger, dir string, meta *BlockMeta) (int64, error) {
+	meta.Version = 1
+
+	// Encode before creating the temporary file so that an encoding failure
+	// cannot leave an open file handle behind.
+	jsonMeta, err := json.MarshalIndent(meta, "", "\t")
+	if err != nil {
+		return 0, err
+	}
+
+	// Make any changes to the file appear atomic.
+	path := filepath.Join(dir, metaFilename)
+	tmp := path + ".tmp"
+	defer func() {
+		if err := os.RemoveAll(tmp); err != nil {
+			level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
+		}
+	}()
+
+	f, err := os.Create(tmp)
+	if err != nil {
+		return 0, err
+	}
+
+	var merr tsdb_errors.MultiError
+	n, err := f.Write(jsonMeta)
+	if err != nil {
+		merr.Add(err)
+		merr.Add(f.Close())
+		return 0, merr.Err()
+	}
+
+	// Force the kernel to persist the file on disk to avoid data loss if the host crashes.
+	if err := f.Sync(); err != nil {
+		merr.Add(err)
+		merr.Add(f.Close())
+		return 0, merr.Err()
+	}
+	if err := f.Close(); err != nil {
+		return 0, err
+	}
+	return int64(n), fileutil.Replace(tmp, path)
+}
+
+// Block represents a directory of time series data covering a continuous time range.
+type Block struct {
+	// mtx guards closing. Delete additionally holds it for its whole duration.
+	mtx            sync.RWMutex
+	// closing is set by Close; once set, startRead and Delete return ErrClosing.
+	closing        bool
+	// pendingReaders counts the readers handed out by Index, Chunks and Tombstones
+	// that have not been closed yet. Close waits for it to drain.
+	pendingReaders sync.WaitGroup
+
+	dir  string
+	meta BlockMeta
+
+	// Symbol Table Size in bytes.
+	// We maintain this variable to avoid recalculating it every time.
+	symbolTableSize uint64
+
+	chunkr     ChunkReader
+	indexr     IndexReader
+	tombstones TombstoneReader
+
+	logger log.Logger
+
+	// Sizes in bytes of the individual parts of the block as determined by OpenBlock.
+	// The tombstone and meta sizes are updated whenever Delete or setCompactionFailed
+	// rewrite those files. Size returns the sum of all four.
+	numBytesChunks    int64
+	numBytesIndex     int64
+	numBytesTombstone int64
+	numBytesMeta      int64
+}
+
+// OpenBlock opens the block stored in dir. The given chunk pool is handed to the
+// chunk reader, which uses it to instantiate chunk structs. A nil logger is
+// replaced by a no-op logger.
+//
+// If any step fails, every reader opened up to that point is closed again and
+// the close errors are combined with the original error.
+func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (pb *Block, err error) {
+	if logger == nil {
+		logger = log.NewNopLogger()
+	}
+
+	// Readers opened so far; released by the deferred function on failure.
+	var opened []io.Closer
+	defer func() {
+		if err == nil {
+			return
+		}
+		var merr tsdb_errors.MultiError
+		merr.Add(err)
+		merr.Add(closeAll(opened))
+		err = merr.Err()
+	}()
+
+	meta, metaSize, err := readMetaFile(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	chunkReader, err := chunks.NewDirReader(chunkDir(dir), pool)
+	if err != nil {
+		return nil, err
+	}
+	opened = append(opened, chunkReader)
+
+	indexReader, err := index.NewFileReader(filepath.Join(dir, indexFilename))
+	if err != nil {
+		return nil, err
+	}
+	opened = append(opened, indexReader)
+
+	tombReader, tombSize, err := readTombstones(dir)
+	if err != nil {
+		return nil, err
+	}
+	opened = append(opened, tombReader)
+
+	return &Block{
+		dir:               dir,
+		meta:              *meta,
+		chunkr:            chunkReader,
+		indexr:            indexReader,
+		tombstones:        tombReader,
+		symbolTableSize:   indexReader.SymbolTableSize(),
+		logger:            logger,
+		numBytesChunks:    chunkReader.Size(),
+		numBytesIndex:     indexReader.Size(),
+		numBytesTombstone: tombSize,
+		numBytesMeta:      metaSize,
+	}, nil
+}
+
+// Close closes the on-disk block. It blocks until every reader obtained from
+// the block has been closed and returns the combined errors of closing the
+// chunk, index and tombstone readers.
+func (pb *Block) Close() error {
+	// Mark the block as closing so that no new readers are handed out.
+	pb.mtx.Lock()
+	pb.closing = true
+	pb.mtx.Unlock()
+
+	// Wait for the readers handed out earlier.
+	pb.pendingReaders.Wait()
+
+	var merr tsdb_errors.MultiError
+	for _, c := range []io.Closer{pb.chunkr, pb.indexr, pb.tombstones} {
+		merr.Add(c.Close())
+	}
+	return merr.Err()
+}
+
+// String returns the ULID of the block in its string form.
+func (pb *Block) String() string {
+	id := pb.meta.ULID
+	return id.String()
+}
+
+// Dir returns the directory of the block.
+func (pb *Block) Dir() string {
+	return pb.dir
+}
+
+// Meta returns meta information about the block.
+func (pb *Block) Meta() BlockMeta {
+	return pb.meta
+}
+
+// MinTime returns the min time of the meta.
+func (pb *Block) MinTime() int64 {
+	return pb.meta.MinTime
+}
+
+// MaxTime returns the max time of the meta.
+func (pb *Block) MaxTime() int64 {
+	return pb.meta.MaxTime
+}
+
+// Size returns the number of bytes that the block takes up, i.e. the sum of
+// the sizes recorded for its chunks, index, tombstones and meta file.
+func (pb *Block) Size() int64 {
+	data := pb.numBytesChunks + pb.numBytesIndex
+	aux := pb.numBytesTombstone + pb.numBytesMeta
+	return data + aux
+}
+
+// ErrClosing is returned when a block is in the process of being closed.
+var ErrClosing = errors.New("block is closing")
+
+// startRead registers a new pending reader on the block.
+// It returns ErrClosing without registering anything once Close has been called.
+func (pb *Block) startRead() error {
+	pb.mtx.RLock()
+	defer pb.mtx.RUnlock()
+
+	if !pb.closing {
+		pb.pendingReaders.Add(1)
+		return nil
+	}
+	return ErrClosing
+}
+
+// Index returns a new IndexReader against the block data.
+// The returned reader counts as a pending reader until it is closed.
+func (pb *Block) Index() (IndexReader, error) {
+	err := pb.startRead()
+	if err != nil {
+		return nil, err
+	}
+	reader := blockIndexReader{ir: pb.indexr, b: pb}
+	return reader, nil
+}
+
+// Chunks returns a new ChunkReader against the block data.
+// The returned reader counts as a pending reader until it is closed.
+func (pb *Block) Chunks() (ChunkReader, error) {
+	err := pb.startRead()
+	if err != nil {
+		return nil, err
+	}
+	reader := blockChunkReader{ChunkReader: pb.chunkr, b: pb}
+	return reader, nil
+}
+
+// Tombstones returns a new TombstoneReader against the block data.
+// The returned reader counts as a pending reader until it is closed.
+func (pb *Block) Tombstones() (TombstoneReader, error) {
+	err := pb.startRead()
+	if err != nil {
+		return nil, err
+	}
+	reader := blockTombstoneReader{TombstoneReader: pb.tombstones, b: pb}
+	return reader, nil
+}
+
+// GetSymbolTableSize returns the Symbol Table Size in the index of this block.
+func (pb *Block) GetSymbolTableSize() uint64 {
+	return pb.symbolTableSize
+}
+
+// setCompactionFailed marks the block's compaction as failed in its meta and
+// rewrites the meta file. On success the recorded meta file size is updated.
+func (pb *Block) setCompactionFailed() error {
+	pb.meta.Compaction.Failed = true
+
+	size, err := writeMetaFile(pb.logger, pb.dir, &pb.meta)
+	if err == nil {
+		pb.numBytesMeta = size
+	}
+	return err
+}
+
+// blockIndexReader is the IndexReader handed out by Block.Index. It forwards to
+// the block's index reader, annotates returned errors with the block's ULID and
+// releases one pending reader on the block when closed.
+type blockIndexReader struct {
+	ir IndexReader
+	b  *Block
+}
+
+// wrapErr annotates a non-nil err with the ULID of the owning block.
+func (r blockIndexReader) wrapErr(err error) error {
+	if err == nil {
+		return nil
+	}
+	return errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
+}
+
+func (r blockIndexReader) Symbols() (map[string]struct{}, error) {
+	symbols, err := r.ir.Symbols()
+	return symbols, r.wrapErr(err)
+}
+
+func (r blockIndexReader) LabelValues(names ...string) (index.StringTuples, error) {
+	tuples, err := r.ir.LabelValues(names...)
+	return tuples, r.wrapErr(err)
+}
+
+func (r blockIndexReader) Postings(name, value string) (index.Postings, error) {
+	postings, err := r.ir.Postings(name, value)
+	return postings, r.wrapErr(err)
+}
+
+func (r blockIndexReader) SortedPostings(p index.Postings) index.Postings {
+	return r.ir.SortedPostings(p)
+}
+
+func (r blockIndexReader) Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error {
+	return r.wrapErr(r.ir.Series(ref, lset, chks))
+}
+
+func (r blockIndexReader) LabelIndices() ([][]string, error) {
+	indices, err := r.ir.LabelIndices()
+	return indices, r.wrapErr(err)
+}
+
+// LabelNames delegates to the owning block; its error is returned unwrapped.
+func (r blockIndexReader) LabelNames() ([]string, error) {
+	return r.b.LabelNames()
+}
+
+// Close releases the pending reader registered on the block. The underlying
+// index reader stays open; it is closed by Block.Close.
+func (r blockIndexReader) Close() error {
+	r.b.pendingReaders.Done()
+	return nil
+}
+
+// blockTombstoneReader is the TombstoneReader handed out by Block.Tombstones.
+// It exposes the embedded reader but overrides Close.
+type blockTombstoneReader struct {
+	TombstoneReader
+	b *Block
+}
+
+// Close releases the pending reader registered on the block instead of closing
+// the embedded reader, which is closed by Block.Close.
+func (r blockTombstoneReader) Close() error {
+	r.b.pendingReaders.Done()
+	return nil
+}
+
+// blockChunkReader is the ChunkReader handed out by Block.Chunks.
+// It exposes the embedded reader but overrides Close.
+type blockChunkReader struct {
+	ChunkReader
+	b *Block
+}
+
+// Close releases the pending reader registered on the block instead of closing
+// the embedded reader, which is closed by Block.Close.
+func (r blockChunkReader) Close() error {
+	r.b.pendingReaders.Done()
+	return nil
+}
+
+// Delete matching series between mint and maxt in the block.
+//
+// For every matched series that has a chunk overlapping [mint, maxt] a tombstone
+// interval is recorded, clamped to the time range covered by the series' chunks.
+// The new tombstones are merged with the existing ones, after which the tombstone
+// file and the meta file are rewritten. It returns ErrClosing if the block is
+// being closed.
+func (pb *Block) Delete(mint, maxt int64, ms ...labels.Matcher) error {
+	pb.mtx.Lock()
+	defer pb.mtx.Unlock()
+
+	if pb.closing {
+		return ErrClosing
+	}
+
+	postings, err := PostingsForMatchers(pb.indexr, ms...)
+	if err != nil {
+		return errors.Wrap(err, "select series")
+	}
+
+	var (
+		indexr = pb.indexr
+		// Choose only valid postings which have chunks in the time-range.
+		stones = newMemTombstones()
+
+		lset  labels.Labels
+		metas []chunks.Meta
+	)
+
+	for postings.Next() {
+		ref := postings.At()
+		if err := indexr.Series(ref, &lset, &metas); err != nil {
+			return err
+		}
+
+		for _, meta := range metas {
+			if !meta.OverlapsClosedInterval(mint, maxt) {
+				continue
+			}
+			// Delete only until the current values and not beyond.
+			tmin, tmax := clampInterval(mint, maxt, metas[0].MinTime, metas[len(metas)-1].MaxTime)
+			stones.addInterval(ref, Interval{tmin, tmax})
+			// One interval per series is enough; move on to the next series.
+			break
+		}
+	}
+	if err := postings.Err(); err != nil {
+		return err
+	}
+
+	// Carry over the tombstones that already existed.
+	mergeExisting := func(id uint64, ivs Intervals) error {
+		for _, iv := range ivs {
+			stones.addInterval(id, iv)
+		}
+		return nil
+	}
+	if err := pb.tombstones.Iter(mergeExisting); err != nil {
+		return err
+	}
+	pb.tombstones = stones
+	pb.meta.Stats.NumTombstones = pb.tombstones.Total()
+
+	tombSize, err := writeTombstoneFile(pb.logger, pb.dir, pb.tombstones)
+	if err != nil {
+		return err
+	}
+	pb.numBytesTombstone = tombSize
+
+	metaSize, err := writeMetaFile(pb.logger, pb.dir, &pb.meta)
+	if err != nil {
+		return err
+	}
+	pb.numBytesMeta = metaSize
+	return nil
+}
+
+// CleanTombstones will remove the tombstones and rewrite the block (only if there are any tombstones).
+// If there was a rewrite, then it returns the ULID of the new block written, else nil.
+// The rewritten block is produced by c.Write into dest and covers the same time
+// range as the current block.
+func (pb *Block) CleanTombstones(dest string, c Compactor) (*ulid.ULID, error) {
+	numStones := 0
+
+	if err := pb.tombstones.Iter(func(id uint64, ivs Intervals) error {
+		numStones += len(ivs)
+		return nil
+	}); err != nil {
+		// The callback itself never fails, so an error here originates from the
+		// tombstone reader. Report it to the caller instead of panicking.
+		return nil, errors.Wrap(err, "count tombstones")
+	}
+	if numStones == 0 {
+		return nil, nil
+	}
+
+	meta := pb.Meta()
+	uid, err := c.Write(dest, pb, pb.meta.MinTime, pb.meta.MaxTime, &meta)
+	if err != nil {
+		return nil, err
+	}
+	return &uid, nil
+}
+
+// Snapshot creates snapshot of the block into dir.
+func (pb *Block) Snapshot(dir string) error {
+	blockDir := filepath.Join(dir, pb.meta.ULID.String())
+	if err := os.MkdirAll(blockDir, 0777); err != nil {
+		return errors.Wrap(err, "create snapshot block dir")
+	}
+
+	chunksDir := chunkDir(blockDir)
+	if err := os.MkdirAll(chunksDir, 0777); err != nil {
+		return errors.Wrap(err, "create snapshot chunk dir")
+	}
+
+	// Hardlink meta, index and tombstones
+	for _, fname := range []string{
+		metaFilename,
+		indexFilename,
+		tombstoneFilename,
+	} {
+		if err := os.Link(filepath.Join(pb.dir, fname), filepath.Join(blockDir, fname)); err != nil {
+			return errors.Wrapf(err, "create snapshot %s", fname)
+		}
+	}
+
+	// Hardlink the chunks
+	curChunkDir := chunkDir(pb.dir)
+	files, err := ioutil.ReadDir(curChunkDir)
+	if err != nil {
+		return errors.Wrap(err, "ReadDir the current chunk dir")
+	}
+
+	for _, f := range files {
+		err := os.Link(filepath.Join(curChunkDir, f.Name()), filepath.Join(chunksDir, f.Name()))
+		if err != nil {
+			return errors.Wrap(err, "hardlink a chunk")
+		}
+	}
+
+	return nil
+}
+
+// OverlapsClosedInterval returns true if the block overlaps [mint, maxt].
+func (pb *Block) OverlapsClosedInterval(mint, maxt int64) bool {
+	// The block itself is a half-open interval
+	// [pb.meta.MinTime, pb.meta.MaxTime), so it misses [mint, maxt] only if it
+	// starts after maxt or ends at or before mint.
+	startsAfter := pb.meta.MinTime > maxt
+	endsBefore := pb.meta.MaxTime <= mint
+	return !startsAfter && !endsBefore
+}
+
+// LabelNames returns all the unique label names present in the Block in sorted order.
+func (pb *Block) LabelNames() ([]string, error) {
+	names, err := pb.indexr.LabelNames()
+	return names, err
+}
+
+// clampInterval restricts the interval [a, b] to the bounds [mint, maxt]:
+// the returned start is never below mint and the returned end never above maxt.
+func clampInterval(a, b, mint, maxt int64) (int64, int64) {
+	lo := mint
+	if a >= mint {
+		lo = a
+	}
+	hi := maxt
+	if b <= maxt {
+		hi = b
+	}
+	return lo, hi
+}
--- a/tsdb/block_test.go
+++ b/tsdb/block_test.go
@ -0,0 +1,295 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"context"
+	"encoding/binary"
+
+	"errors"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/go-kit/kit/log"
+	"github.com/prometheus/tsdb/chunks"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/tsdbutil"
+)
+
+// In Prometheus 2.1.0 we had a bug where the meta.json version was falsely bumped
+// to 2. We had a migration in place resetting it to 1 but we should move immediately to
+// version 3 next time to avoid confusion and issues.
+// The test writes a meta file and checks that the version read back is not 2.
+func TestBlockMetaMustNeverBeVersion2(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "metaversion")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	_, err = writeMetaFile(log.NewNopLogger(), tmpdir, new(BlockMeta))
+	testutil.Ok(t, err)
+
+	got, _, err := readMetaFile(tmpdir)
+	testutil.Ok(t, err)
+	testutil.Assert(t, got.Version != 2, "meta.json version must never be 2")
+}
+
+// TestSetCompactionFailed checks that setCompactionFailed sets the failed flag
+// in memory and that the flag is still set after the block is reopened.
+func TestSetCompactionFailed(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "test")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 1))
+
+	// The flag starts out unset and is set by setCompactionFailed.
+	blk, err := OpenBlock(nil, blockDir, nil)
+	testutil.Ok(t, err)
+	testutil.Equals(t, false, blk.meta.Compaction.Failed)
+	testutil.Ok(t, blk.setCompactionFailed())
+	testutil.Equals(t, true, blk.meta.Compaction.Failed)
+	testutil.Ok(t, blk.Close())
+
+	// Reopening the block must show the persisted flag.
+	blk, err = OpenBlock(nil, blockDir, nil)
+	testutil.Ok(t, err)
+	testutil.Equals(t, true, blk.meta.Compaction.Failed)
+	testutil.Ok(t, blk.Close())
+}
+
+// TestCreateBlock checks that a block written by createBlock can be opened and closed.
+func TestCreateBlock(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "test")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 10))
+	blk, err := OpenBlock(nil, blockDir, nil)
+	testutil.Ok(t, err)
+	testutil.Ok(t, blk.Close())
+}
+
+// TestCorruptedChunk corrupts the first chunk segment file of a freshly created
+// block in several ways and checks that OpenBlock fails with the expected error.
+func TestCorruptedChunk(t *testing.T) {
+	for name, test := range map[string]struct {
+		// Func that applies the corruption. It receives the subtest's t so that
+		// failures are reported on the test that is actually running.
+		corrFunc func(t *testing.T, f *os.File)
+		expErr   error
+	}{
+		"invalid header size": {
+			func(t *testing.T, f *os.File) {
+				err := f.Truncate(1)
+				testutil.Ok(t, err)
+			},
+			errors.New("invalid chunk header in segment 0: invalid size"),
+		},
+		"invalid magic number": {
+			func(t *testing.T, f *os.File) {
+				magicChunksOffset := int64(0)
+				_, err := f.Seek(magicChunksOffset, 0)
+				testutil.Ok(t, err)
+
+				// Set invalid magic number.
+				b := make([]byte, chunks.MagicChunksSize)
+				binary.BigEndian.PutUint32(b[:chunks.MagicChunksSize], 0x00000000)
+				n, err := f.Write(b)
+				testutil.Ok(t, err)
+				testutil.Equals(t, chunks.MagicChunksSize, n)
+			},
+			errors.New("invalid magic number 0"),
+		},
+		"invalid chunk format version": {
+			func(t *testing.T, f *os.File) {
+				chunksFormatVersionOffset := int64(4)
+				_, err := f.Seek(chunksFormatVersionOffset, 0)
+				testutil.Ok(t, err)
+
+				// Set invalid chunk format version.
+				b := make([]byte, chunks.ChunksFormatVersionSize)
+				b[0] = 0
+				n, err := f.Write(b)
+				testutil.Ok(t, err)
+				testutil.Equals(t, chunks.ChunksFormatVersionSize, n)
+			},
+			errors.New("invalid chunk format version 0"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			tmpdir, err := ioutil.TempDir("", "test_open_block_chunk_corrupted")
+			testutil.Ok(t, err)
+			defer func() {
+				testutil.Ok(t, os.RemoveAll(tmpdir))
+			}()
+
+			blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 1))
+			files, err := sequenceFiles(chunkDir(blockDir))
+			testutil.Ok(t, err)
+			testutil.Assert(t, len(files) > 0, "No chunk created.")
+
+			f, err := os.OpenFile(files[0], os.O_RDWR, 0666)
+			testutil.Ok(t, err)
+
+			// Apply corruption function.
+			test.corrFunc(t, f)
+			testutil.Ok(t, f.Close())
+
+			b, err := OpenBlock(nil, blockDir, nil)
+			if err == nil {
+				// Release the readers of the unexpectedly opened block before
+				// the assertion below fails the test.
+				testutil.Ok(t, b.Close())
+			}
+			testutil.Assert(t, err != nil, "opening a block with a corrupted chunk segment must fail")
+			testutil.Equals(t, test.expErr.Error(), err.Error())
+		})
+	}
+}
+
+// TestBlockSize ensures that the block size is calculated correctly.
+// It compares the size reported by Block.Size with the size of the block
+// directory on disk: right after creation, after a delete and after compaction.
+func TestBlockSize(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "test_blockSize")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	var (
+		blockInit    *Block
+		expSizeInit  int64
+		blockDirInit string
+	)
+
+	// Create a block and compare the reported size vs actual disk size.
+	{
+		blockDirInit = createBlock(t, tmpdir, genSeries(10, 1, 1, 100))
+		blockInit, err = OpenBlock(nil, blockDirInit, nil)
+		testutil.Ok(t, err)
+		defer func() {
+			testutil.Ok(t, blockInit.Close())
+		}()
+		expSizeInit = blockInit.Size()
+		actSizeInit := testutil.DirSize(t, blockInit.Dir())
+		testutil.Equals(t, expSizeInit, actSizeInit)
+	}
+
+	// Delete some series and check the sizes again.
+	{
+		testutil.Ok(t, blockInit.Delete(1, 10, labels.NewMustRegexpMatcher("", ".*")))
+		expAfterDelete := blockInit.Size()
+		testutil.Assert(t, expAfterDelete > expSizeInit, "after a delete the block size should be bigger as the tombstone file should grow %v > %v", expAfterDelete, expSizeInit)
+		// DirSize reports failures through t itself, so there is no error to check here.
+		actAfterDelete := testutil.DirSize(t, blockDirInit)
+		testutil.Equals(t, expAfterDelete, actAfterDelete, "after a delete reported block size doesn't match actual disk size")
+
+		c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil)
+		testutil.Ok(t, err)
+		blockDirAfterCompact, err := c.Compact(tmpdir, []string{blockInit.Dir()}, nil)
+		testutil.Ok(t, err)
+		blockAfterCompact, err := OpenBlock(nil, filepath.Join(tmpdir, blockDirAfterCompact.String()), nil)
+		testutil.Ok(t, err)
+		defer func() {
+			testutil.Ok(t, blockAfterCompact.Close())
+		}()
+		expAfterCompact := blockAfterCompact.Size()
+		actAfterCompact := testutil.DirSize(t, blockAfterCompact.Dir())
+		testutil.Assert(t, actAfterDelete > actAfterCompact, "after a delete and compaction the block size should be smaller %v,%v", actAfterDelete, actAfterCompact)
+		testutil.Equals(t, expAfterCompact, actAfterCompact, "after a delete and compaction reported block size doesn't match actual disk size")
+	}
+}
+
+// createBlock creates a block with given set of series and returns its dir.
+// The series are appended to a fresh head which is then written into dir
+// by a leveled compactor.
+func createBlock(tb testing.TB, dir string, series []Series) string {
+	head := createHead(tb, series)
+
+	compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{1000000}, nil)
+	testutil.Ok(tb, err)
+	testutil.Ok(tb, os.MkdirAll(dir, 0777))
+
+	// Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
+	// Because of this block intervals are always +1 than the total samples it includes.
+	mint, maxt := head.MinTime(), head.MaxTime()+1
+	id, err := compactor.Write(dir, head, mint, maxt, nil)
+	testutil.Ok(tb, err)
+	return filepath.Join(dir, id.String())
+}
+
+func createHead(tb testing.TB, series []Series) *Head {
+	head, err := NewHead(nil, nil, nil, 2*60*60*1000)
+	testutil.Ok(tb, err)
+	defer head.Close()
+
+	app := head.Appender()
+	for _, s := range series {
+		ref := uint64(0)
+		it := s.Iterator()
+		for it.Next() {
+			t, v := it.At()
+			if ref != 0 {
+				err := app.AddFast(ref, t, v)
+				if err == nil {
+					continue
+				}
+			}
+			ref, err = app.Add(s.Labels(), t, v)
+			testutil.Ok(tb, err)
+		}
+		testutil.Ok(tb, it.Err())
+	}
+	err = app.Commit()
+	testutil.Ok(tb, err)
+	return head
+}
+
+const (
+	defaultLabelName  = "labelName"
+	defaultLabelValue = "labelValue"
+)
+
+// genSeries generates totalSeries series with labelCount labels each.
+// Every series gets the label defaultLabelName set to its index plus generated
+// defaultLabelName<j>/defaultLabelValue<j> pairs until labelCount labels exist,
+// and one sample with a random value for every timestamp in [mint, maxt).
+// It returns nil if totalSeries or labelCount is zero.
+func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
+	if totalSeries == 0 || labelCount == 0 {
+		return nil
+	}
+
+	series := make([]Series, totalSeries)
+	for i := range series {
+		lbls := make(map[string]string, labelCount)
+		lbls[defaultLabelName] = strconv.Itoa(i)
+		j := 1
+		for len(lbls) < labelCount {
+			suffix := strconv.Itoa(j)
+			lbls[defaultLabelName+suffix] = defaultLabelValue + suffix
+			j++
+		}
+
+		samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
+		for ts := mint; ts < maxt; ts++ {
+			samples = append(samples, sample{t: ts, v: rand.Float64()})
+		}
+		series[i] = newSeries(lbls, samples)
+	}
+	return series
+}
+
+// populateSeries generates one series per non-empty label set in lbls.
+// Each series gets one sample with a random value for every timestamp in
+// [mint, maxt]. It returns nil if lbls is empty.
+func populateSeries(lbls []map[string]string, mint, maxt int64) []Series {
+	if len(lbls) == 0 {
+		return nil
+	}
+
+	out := make([]Series, 0, len(lbls))
+	for _, set := range lbls {
+		if len(set) > 0 {
+			samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
+			for ts := mint; ts <= maxt; ts++ {
+				samples = append(samples, sample{t: ts, v: rand.Float64()})
+			}
+			out = append(out, newSeries(set, samples))
+		}
+	}
+	return out
+}
--- a/tsdb/checkpoint.go
+++ b/tsdb/checkpoint.go
@ -0,0 +1,261 @@
+// Copyright 2018 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/pkg/errors"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/fileutil"
+	"github.com/prometheus/tsdb/wal"
+)
+
+// CheckpointStats holds statistics about a created checkpoint.
+type CheckpointStats struct {
+	DroppedSeries     int // Series rejected by the keep function.
+	DroppedSamples    int // Samples with a timestamp below mint.
+	DroppedTombstones int // Tombstones whose intervals all end before mint.
+	TotalSeries       int // Processed series including dropped ones.
+	TotalSamples      int // Processed samples including dropped ones.
+	TotalTombstones   int // Processed tombstones including dropped ones.
+}
+
+// LastCheckpoint returns the directory name and index of the most recent checkpoint.
+// If dir does not contain any checkpoints, ErrNotFound is returned.
+func LastCheckpoint(dir string) (string, int, error) {
+	files, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return "", 0, err
+	}
+	// Traverse list backwards since there may be multiple checkpoints left.
+	for i := len(files) - 1; i >= 0; i-- {
+		fi := files[i]
+
+		if !strings.HasPrefix(fi.Name(), checkpointPrefix) {
+			continue
+		}
+		if !fi.IsDir() {
+			return "", 0, errors.Errorf("checkpoint %s is not a directory", fi.Name())
+		}
+		idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):])
+		if err != nil {
+			continue
+		}
+		return filepath.Join(dir, fi.Name()), idx, nil
+	}
+	return "", 0, ErrNotFound
+}
+
+// DeleteCheckpoints removes every checkpoint in dir whose index is lower than maxIndex.
+// Entries without the checkpoint prefix or with a non-numeric suffix are left alone.
+// Removal failures do not stop the scan; they are collected and returned together.
+func DeleteCheckpoints(dir string, maxIndex int) error {
+	entries, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+
+	var merr tsdb_errors.MultiError
+	for _, entry := range entries {
+		name := entry.Name()
+		if !strings.HasPrefix(name, checkpointPrefix) {
+			continue
+		}
+		idx, convErr := strconv.Atoi(strings.TrimPrefix(name, checkpointPrefix))
+		if convErr == nil && idx < maxIndex {
+			if rmErr := os.RemoveAll(filepath.Join(dir, name)); rmErr != nil {
+				merr.Add(rmErr)
+			}
+		}
+	}
+	return merr.Err()
+}
+
+// checkpointPrefix is the name prefix of checkpoint directories; the checkpoint index follows it.
+const checkpointPrefix = "checkpoint."
+
+// Checkpoint creates a compacted checkpoint of segments in range [from, to] in the given WAL.
+// It includes the most recent checkpoint if it exists.
+// All series not satisfying keep and samples below mint are dropped.
+// Tombstones are kept only if at least one of their intervals ends at or after mint.
+//
+// The checkpoint is stored in a directory named checkpoint.N in the same
+// segmented format as the original WAL itself.
+// This makes it easy to read it through the WAL package and concatenate
+// it with the original WAL.
+//
+// The checkpoint is first written to a temporary directory (checkpoint.N.tmp)
+// which replaces the final directory only after all records were written.
+// A temporary directory left behind by an earlier, interrupted run is removed
+// up front so that the new checkpoint always starts from an empty directory.
+//
+// It returns statistics about the processed and dropped records. An error is
+// returned if the segments cannot be read or decoded, if a record of an
+// unknown type is encountered, or if writing the checkpoint fails.
+func Checkpoint(w *wal.WAL, from, to int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) {
+	stats := &CheckpointStats{}
+	var sgmReader io.ReadCloser
+
+	{
+		var sgmRange []wal.SegmentRange
+		dir, idx, err := LastCheckpoint(w.Dir())
+		if err != nil && err != ErrNotFound {
+			return nil, errors.Wrap(err, "find last checkpoint")
+		}
+		last := idx + 1
+		if err == nil {
+			if from > last {
+				return nil, fmt.Errorf("unexpected gap to last checkpoint. expected:%v, requested:%v", last, from)
+			}
+			// Ignore WAL files below the checkpoint. They shouldn't exist to begin with.
+			from = last
+
+			// Read the previous checkpoint in full before the WAL segments.
+			sgmRange = append(sgmRange, wal.SegmentRange{Dir: dir, Last: math.MaxInt32})
+		}
+
+		sgmRange = append(sgmRange, wal.SegmentRange{Dir: w.Dir(), First: from, Last: to})
+		sgmReader, err = wal.NewSegmentsRangeReader(sgmRange...)
+		if err != nil {
+			return nil, errors.Wrap(err, "create segment reader")
+		}
+		defer sgmReader.Close()
+	}
+
+	cpdir := filepath.Join(w.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", to))
+	cpdirtmp := cpdir + ".tmp"
+
+	// Drop any leftover from a previous run that was interrupted before the
+	// cleanup below could execute; MkdirAll alone would silently reuse it.
+	if err := os.RemoveAll(cpdirtmp); err != nil {
+		return nil, errors.Wrap(err, "remove previous temporary checkpoint dir")
+	}
+	if err := os.MkdirAll(cpdirtmp, 0777); err != nil {
+		return nil, errors.Wrap(err, "create checkpoint dir")
+	}
+	cp, err := wal.New(nil, nil, cpdirtmp, w.CompressionEnabled())
+	if err != nil {
+		return nil, errors.Wrap(err, "open checkpoint")
+	}
+
+	// Ensures that an early return caused by an error doesn't leave any tmp files.
+	defer func() {
+		cp.Close()
+		os.RemoveAll(cpdirtmp)
+	}()
+
+	r := wal.NewReader(sgmReader)
+
+	var (
+		series  []RefSeries
+		samples []RefSample
+		tstones []Stone
+		dec     RecordDecoder
+		enc     RecordEncoder
+		buf     []byte
+		recs    [][]byte
+	)
+	for r.Next() {
+		series, samples, tstones = series[:0], samples[:0], tstones[:0]
+
+		// We don't reset the buffer since we batch up multiple records
+		// before writing them to the checkpoint.
+		// Remember where the record for this iteration starts.
+		start := len(buf)
+		rec := r.Record()
+
+		switch dec.Type(rec) {
+		case RecordSeries:
+			series, err = dec.Series(rec, series)
+			if err != nil {
+				return nil, errors.Wrap(err, "decode series")
+			}
+			// Drop irrelevant series in place.
+			repl := series[:0]
+			for _, s := range series {
+				if keep(s.Ref) {
+					repl = append(repl, s)
+				}
+			}
+			if len(repl) > 0 {
+				buf = enc.Series(repl, buf)
+			}
+			stats.TotalSeries += len(series)
+			stats.DroppedSeries += len(series) - len(repl)
+
+		case RecordSamples:
+			samples, err = dec.Samples(rec, samples)
+			if err != nil {
+				return nil, errors.Wrap(err, "decode samples")
+			}
+			// Drop irrelevant samples in place.
+			repl := samples[:0]
+			for _, s := range samples {
+				if s.T >= mint {
+					repl = append(repl, s)
+				}
+			}
+			if len(repl) > 0 {
+				buf = enc.Samples(repl, buf)
+			}
+			stats.TotalSamples += len(samples)
+			stats.DroppedSamples += len(samples) - len(repl)
+
+		case RecordTombstones:
+			tstones, err = dec.Tombstones(rec, tstones)
+			if err != nil {
+				return nil, errors.Wrap(err, "decode deletes")
+			}
+			// Drop irrelevant tombstones in place.
+			// A tombstone is kept as a whole as soon as one interval reaches mint.
+			repl := tstones[:0]
+			for _, s := range tstones {
+				for _, iv := range s.intervals {
+					if iv.Maxt >= mint {
+						repl = append(repl, s)
+						break
+					}
+				}
+			}
+			if len(repl) > 0 {
+				buf = enc.Tombstones(repl, buf)
+			}
+			stats.TotalTombstones += len(tstones)
+			stats.DroppedTombstones += len(tstones) - len(repl)
+
+		default:
+			return nil, errors.New("invalid record type")
+		}
+		if len(buf[start:]) == 0 {
+			continue // All contents discarded.
+		}
+		recs = append(recs, buf[start:])
+
+		// Flush records in 1 MB increments.
+		if len(buf) > 1*1024*1024 {
+			if err := cp.Log(recs...); err != nil {
+				return nil, errors.Wrap(err, "flush records")
+			}
+			buf, recs = buf[:0], recs[:0]
+		}
+	}
+	// If we hit any corruption during checkpointing, repairing is not an option.
+	// The head won't know which series records are lost.
+	if r.Err() != nil {
+		return nil, errors.Wrap(r.Err(), "read segments")
+	}
+
+	// Flush remaining records.
+	if err := cp.Log(recs...); err != nil {
+		return nil, errors.Wrap(err, "flush records")
+	}
+	if err := cp.Close(); err != nil {
+		return nil, errors.Wrap(err, "close checkpoint")
+	}
+	// Only now does the checkpoint become visible under its final name.
+	if err := fileutil.Replace(cpdirtmp, cpdir); err != nil {
+		return nil, errors.Wrap(err, "rename checkpoint directory")
+	}
+
+	return stats, nil
+}
--- a/tsdb/checkpoint_test.go
+++ b/tsdb/checkpoint_test.go
@ -0,0 +1,224 @@
+// Copyright 2018 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/fileutil"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/wal"
+)
+
+// TestLastCheckpoint creates checkpoint directories one after another and
+// verifies which of them LastCheckpoint reports after each step.
+func TestLastCheckpoint(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_checkpoint")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	// A directory without any checkpoint yields ErrNotFound.
+	_, _, err = LastCheckpoint(dir)
+	testutil.Equals(t, ErrNotFound, err)
+
+	// Each step adds one directory; the expectation describes the state
+	// after that directory was created on top of all previous ones.
+	steps := []struct {
+		create  string
+		expName string
+		expIdx  int
+	}{
+		{create: "checkpoint.0000", expName: "checkpoint.0000", expIdx: 0},
+		{create: "checkpoint.xyz", expName: "checkpoint.0000", expIdx: 0},
+		{create: "checkpoint.1", expName: "checkpoint.1", expIdx: 1},
+		{create: "checkpoint.1000", expName: "checkpoint.1000", expIdx: 1000},
+	}
+	for _, step := range steps {
+		testutil.Ok(t, os.MkdirAll(filepath.Join(dir, step.create), 0777))
+
+		name, idx, err := LastCheckpoint(dir)
+		testutil.Ok(t, err)
+		testutil.Equals(t, filepath.Join(dir, step.expName), name)
+		testutil.Equals(t, step.expIdx, idx)
+	}
+}
+
+// TestDeleteCheckpoints verifies that only checkpoints below the given index are removed.
+func TestDeleteCheckpoints(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_checkpoint")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	// Deleting from an empty directory must succeed.
+	testutil.Ok(t, DeleteCheckpoints(dir, 0))
+
+	for _, name := range []string{"checkpoint.00", "checkpoint.01", "checkpoint.02", "checkpoint.03"} {
+		testutil.Ok(t, os.MkdirAll(filepath.Join(dir, name), 0777))
+	}
+
+	testutil.Ok(t, DeleteCheckpoints(dir, 2))
+
+	remaining, err := fileutil.ReadDir(dir)
+	testutil.Ok(t, err)
+	testutil.Equals(t, []string{"checkpoint.02", "checkpoint.03"}, remaining)
+}
+
+// TestCheckpoint writes an old checkpoint plus several WAL segments, runs
+// Checkpoint over them with and without compression and verifies that only
+// the new checkpoint directory remains, that it holds no sample older than
+// the cut-off and that only series with an even reference survived.
+func TestCheckpoint(t *testing.T) {
+	for _, compress := range []bool{false, true} {
+		t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
+			dir, err := ioutil.TempDir("", "test_checkpoint")
+			testutil.Ok(t, err)
+			defer func() {
+				testutil.Ok(t, os.RemoveAll(dir))
+			}()
+
+			var enc RecordEncoder
+			// Create a dummy segment to bump the initial number.
+			seg, err := wal.CreateSegment(dir, 100)
+			testutil.Ok(t, err)
+			testutil.Ok(t, seg.Close())
+
+			// Manually create checkpoint for 99 and earlier.
+			w, err := wal.New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress)
+			testutil.Ok(t, err)
+
+			// Add some data we expect to be around later.
+			err = w.Log(enc.Series([]RefSeries{
+				{Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")},
+				{Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")},
+			}, nil))
+			testutil.Ok(t, err)
+			testutil.Ok(t, w.Close())
+
+			// Start a WAL and write records to it as usual.
+			w, err = wal.NewSize(nil, nil, dir, 64*1024, compress)
+			testutil.Ok(t, err)
+
+			var last int64
+			for i := 0; ; i++ {
+				_, n, err := w.Segments()
+				testutil.Ok(t, err)
+				if n >= 106 {
+					break
+				}
+				// Write some series initially.
+				if i == 0 {
+					b := enc.Series([]RefSeries{
+						{Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")},
+						{Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")},
+						{Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")},
+						{Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")},
+					}, nil)
+					testutil.Ok(t, w.Log(b))
+				}
+				// Write samples until the WAL has enough segments.
+				// Make them have drifting timestamps within a record to see that they
+				// get filtered properly.
+				b := enc.Samples([]RefSample{
+					{Ref: 0, T: last, V: float64(i)},
+					{Ref: 1, T: last + 10000, V: float64(i)},
+					{Ref: 2, T: last + 20000, V: float64(i)},
+					{Ref: 3, T: last + 30000, V: float64(i)},
+				}, nil)
+				testutil.Ok(t, w.Log(b))
+
+				last += 100
+			}
+			testutil.Ok(t, w.Close())
+
+			// Keep only series with an even reference and samples from the
+			// second half of the written time range.
+			_, err = Checkpoint(w, 100, 106, func(x uint64) bool {
+				return x%2 == 0
+			}, last/2)
+			testutil.Ok(t, err)
+			testutil.Ok(t, w.Truncate(107))
+			testutil.Ok(t, DeleteCheckpoints(w.Dir(), 106))
+
+			// Only the new checkpoint should be left.
+			files, err := fileutil.ReadDir(dir)
+			testutil.Ok(t, err)
+			testutil.Equals(t, 1, len(files))
+			testutil.Equals(t, "checkpoint.000106", files[0])
+
+			sr, err := wal.NewSegmentsReader(filepath.Join(dir, "checkpoint.000106"))
+			testutil.Ok(t, err)
+			defer sr.Close()
+
+			var dec RecordDecoder
+			var series []RefSeries
+			r := wal.NewReader(sr)
+
+			// Collect all series records and check every sample's timestamp.
+			for r.Next() {
+				rec := r.Record()
+
+				switch dec.Type(rec) {
+				case RecordSeries:
+					series, err = dec.Series(rec, series)
+					testutil.Ok(t, err)
+				case RecordSamples:
+					samples, err := dec.Samples(rec, nil)
+					testutil.Ok(t, err)
+					for _, s := range samples {
+						testutil.Assert(t, s.T >= last/2, "sample with wrong timestamp")
+					}
+				}
+			}
+			testutil.Ok(t, r.Err())
+			testutil.Equals(t, []RefSeries{
+				{Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")},
+				{Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")},
+				{Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")},
+			}, series)
+		})
+	}
+}
+
+// TestCheckpointNoTmpFolderAfterError verifies that a failing Checkpoint run
+// does not leave its temporary directory behind in the WAL directory.
+func TestCheckpointNoTmpFolderAfterError(t *testing.T) {
+	// Create a new WAL with an invalid record.
+	dir, err := ioutil.TempDir("", "test_checkpoint")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+	w, err := wal.NewSize(nil, nil, dir, 64*1024, false)
+	testutil.Ok(t, err)
+	testutil.Ok(t, w.Log([]byte{99}))
+	// Check the close error as well, like the other checkpoint tests do.
+	testutil.Ok(t, w.Close())
+
+	// Run the checkpoint. Since the WAL contains an invalid record this should return an error.
+	_, err = Checkpoint(w, 0, 1, nil, 0)
+	testutil.NotOk(t, err)
+
+	// Walk the WAL dir to make sure there is no tmp folder left behind after the error.
+	err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			// Wrapf already appends the wrapped error to the message.
+			return errors.Wrapf(err, "access err %q", path)
+		}
+		if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") {
+			return fmt.Errorf("wal dir contains temporary folder:%s", info.Name())
+		}
+		return nil
+	})
+	testutil.Ok(t, err)
+}
--- a/tsdb/chunkenc/bstream.go
+++ b/tsdb/chunkenc/bstream.go
@ -0,0 +1,200 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The code in this file was largely written by Damian Gryski as part of
+// https://github.com/dgryski/go-tsz and published under the license below.
+// It received minor modifications to suit Prometheus's needs.
+
+// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
+// All rights reserved.
+
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package chunkenc
+
+import "io"
+
+// bstream is a stream of bits.
+//
+// The meaning of count depends on the direction of use: the write methods
+// treat it as the number of bits still free in the last byte of stream, the
+// read methods as the number of bits not yet consumed from the first byte.
+type bstream struct {
+	stream []byte // the data stream
+	count  uint8  // how many bits are valid in current byte
+}
+
+func newBReader(b []byte) bstream {
+	return bstream{stream: b, count: 8}
+}
+
+// bytes returns the underlying byte slice of the stream without copying it.
+func (b *bstream) bytes() []byte {
+	return b.stream
+}
+
+// bit is a single bit of a bstream.
+type bit bool
+
+// The two possible values of a bit.
+const (
+	zero bit = false
+	one  bit = true
+)
+
+// writeBit appends a single bit to the stream. A new byte is started when the
+// last byte has no free bits left; bits fill a byte from its most significant end.
+func (b *bstream) writeBit(bit bit) {
+	if b.count == 0 {
+		b.stream = append(b.stream, 0)
+		b.count = 8
+	}
+
+	// Claim the next free position first, then set it if required.
+	b.count--
+	if bit {
+		b.stream[len(b.stream)-1] |= 1 << b.count
+	}
+}
+
+// writeByte appends the 8 bits of byt to the stream. The free bits of the last
+// byte take the leading bits of byt; a newly appended byte takes the rest.
+// The number of free bits in the last byte is the same before and after the call.
+func (b *bstream) writeByte(byt byte) {
+	if b.count == 0 {
+		b.stream = append(b.stream, 0)
+		b.count = 8
+	}
+
+	// Complete the last byte with the leading b.count bits of byt.
+	last := len(b.stream) - 1
+	b.stream[last] |= byt >> (8 - b.count)
+
+	// The remaining bits, left-aligned, start the next byte. With 8 free bits
+	// the shift yields zero and an empty byte is appended.
+	b.stream = append(b.stream, byt<<b.count)
+}
+
+// writeBits appends the nbits least significant bits of u to the stream,
+// most significant bit first.
+func (b *bstream) writeBits(u uint64, nbits int) {
+	// Left-align the bits to write so they can be taken from the top.
+	u <<= (64 - uint(nbits))
+
+	// Whole bytes first.
+	for ; nbits >= 8; nbits -= 8 {
+		b.writeByte(byte(u >> 56))
+		u <<= 8
+	}
+
+	// Then the remaining bits one at a time.
+	for ; nbits > 0; nbits-- {
+		b.writeBit((u >> 63) == 1)
+		u <<= 1
+	}
+}
+
+// readBit consumes and returns the next bit of the stream.
+// It returns io.EOF once no more data is available.
+func (b *bstream) readBit() (bit, error) {
+	if len(b.stream) == 0 {
+		return false, io.EOF
+	}
+
+	// The current byte is used up; move on to the next one.
+	if b.count == 0 {
+		b.stream = b.stream[1:]
+		if len(b.stream) == 0 {
+			return false, io.EOF
+		}
+		b.count = 8
+	}
+
+	// Select the first unread bit, counting from the most significant end.
+	mask := byte(0x80) >> (8 - b.count)
+	b.count--
+	return b.stream[0]&mask != 0, nil
+}
+
+// ReadByte is the exported form of readByte: it consumes and returns the next
+// 8 bits of the stream.
+func (b *bstream) ReadByte() (byte, error) {
+	return b.readByte()
+}
+
+// readByte consumes and returns the next 8 bits of the stream, which may span
+// two bytes of the underlying slice. It returns io.EOF if the stream does not
+// hold enough data.
+func (b *bstream) readByte() (byte, error) {
+	if len(b.stream) == 0 {
+		return 0, io.EOF
+	}
+
+	switch b.count {
+	case 0:
+		// The current byte is used up: the next byte is returned as a whole
+		// and is in turn left fully consumed.
+		b.stream = b.stream[1:]
+		if len(b.stream) == 0 {
+			return 0, io.EOF
+		}
+		return b.stream[0], nil
+	case 8:
+		// Byte aligned: hand out the current byte and mark it as consumed.
+		b.count = 0
+		return b.stream[0], nil
+	}
+
+	// Unaligned: the unread bits of the current byte form the upper part ...
+	upper := b.stream[0] << (8 - b.count)
+	b.stream = b.stream[1:]
+	if len(b.stream) == 0 {
+		return 0, io.EOF
+	}
+
+	// ... and the leading bits of the next byte the lower part. The number of
+	// unread bits in the new current byte equals that of the previous one.
+	return upper | b.stream[0]>>b.count, nil
+}
+
+// readBits consumes the next nbits bits of the stream and returns them in the
+// low-order bits of the result, with the bit read first being the most
+// significant one.
+// It returns io.EOF if the stream runs out of data before nbits bits were
+// read; the stream position is not restored in that case.
+func (b *bstream) readBits(nbits int) (uint64, error) {
+	var u uint64
+
+	// Consume whole bytes first.
+	for nbits >= 8 {
+		byt, err := b.readByte()
+		if err != nil {
+			return 0, err
+		}
+
+		u = (u << 8) | uint64(byt)
+		nbits -= 8
+	}
+
+	if nbits == 0 {
+		return u, nil
+	}
+
+	// Fewer than 8 bits are left to read. Guard against an exhausted stream,
+	// which would otherwise panic on the b.stream[0] accesses below instead of
+	// reporting io.EOF like readBit and readByte do.
+	if len(b.stream) == 0 {
+		return 0, io.EOF
+	}
+
+	// The current byte does not hold enough unread bits: take all of them and
+	// continue with the next byte.
+	if nbits > int(b.count) {
+		u = (u << uint(b.count)) | uint64((b.stream[0]<<(8-b.count))>>(8-b.count))
+		nbits -= int(b.count)
+		b.stream = b.stream[1:]
+
+		if len(b.stream) == 0 {
+			return 0, io.EOF
+		}
+		b.count = 8
+	}
+
+	// Take the remaining nbits from the unread part of the current byte.
+	u = (u << uint(nbits)) | uint64((b.stream[0]<<(8-b.count))>>(8-uint(nbits)))
+	b.count -= uint8(nbits)
+	return u, nil
+}
--- a/tsdb/chunkenc/chunk.go
+++ b/tsdb/chunkenc/chunk.go
@ -0,0 +1,138 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package chunkenc
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/pkg/errors"
+)
+
+// Encoding is the identifier for a chunk encoding.
+type Encoding uint8
+
+// String returns a human readable name for the encoding, or "<unknown>" for
+// values that do not correspond to a known encoding.
+func (e Encoding) String() string {
+	if e == EncNone {
+		return "none"
+	}
+	if e == EncXOR {
+		return "XOR"
+	}
+	return "<unknown>"
+}
+
+// The different available chunk encodings.
+const (
+	// EncNone is the zero value of Encoding. Neither the pool nor FromData
+	// can create a chunk for it.
+	EncNone Encoding = iota
+	// EncXOR is the encoding of chunks implemented by XORChunk.
+	EncXOR
+)
+
+// Chunk holds a sequence of sample pairs that can be iterated over and appended to.
+type Chunk interface {
+	// Bytes presumably returns the encoded chunk data — confirm against the implementations.
+	Bytes() []byte
+	// Encoding returns the encoding of the chunk.
+	Encoding() Encoding
+	// Appender returns an Appender that adds samples to the chunk.
+	Appender() (Appender, error)
+	// The iterator passed as argument is for re-use.
+	// Depending on implementation, the iterator can
+	// be re-used or a new iterator can be allocated.
+	Iterator(Iterator) Iterator
+	// NumSamples presumably returns the number of samples held by the chunk — confirm against the implementations.
+	NumSamples() int
+}
+
+// Appender adds sample pairs to a chunk.
+type Appender interface {
+	// Append adds a sample given as timestamp and value.
+	Append(int64, float64)
+}
+
+// Iterator is a simple iterator that can only get the next value.
+type Iterator interface {
+	// At returns the timestamp and value of the current sample.
+	At() (int64, float64)
+	// Err returns the error of the iteration, if any.
+	Err() error
+	// Next advances the iterator and reports whether a sample is available.
+	Next() bool
+}
+
+// NewNopIterator returns a new chunk iterator that does not hold any data.
+func NewNopIterator() Iterator {
+	return nopIterator{}
+}
+
+// nopIterator is an Iterator without samples: Next is always false, At
+// returns zero values and Err is always nil.
+type nopIterator struct{}
+
+func (nopIterator) At() (int64, float64) { return 0, 0 }
+func (nopIterator) Next() bool           { return false }
+func (nopIterator) Err() error           { return nil }
+
+// Pool is used to create and reuse chunk references to avoid allocations.
+type Pool interface {
+	// Put hands a chunk back to the pool for later reuse.
+	Put(Chunk) error
+	// Get returns a chunk of encoding e that is backed by the data in b.
+	Get(e Encoding, b []byte) (Chunk, error)
+}
+
+// pool is a memory pool of chunk objects.
+type pool struct {
+	xor sync.Pool
+}
+
+// NewPool returns a new pool. Its XOR chunks are allocated with an empty bit
+// stream whenever no previously returned chunk is available for reuse.
+func NewPool() Pool {
+	newXOR := func() interface{} {
+		return new(XORChunk)
+	}
+	return &pool{xor: sync.Pool{New: newXOR}}
+}
+
+// Get returns a chunk of encoding e from the pool whose bit stream is set to b.
+// Only EncXOR is supported; any other encoding yields an error.
+func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
+	if e != EncXOR {
+		return nil, errors.Errorf("invalid encoding %q", e)
+	}
+
+	xc := p.xor.Get().(*XORChunk)
+	xc.b.count = 0
+	xc.b.stream = b
+	return xc, nil
+}
+
+// Put hands c back to the pool after detaching it from its data.
+// Chunks that report EncXOR but are not an *XORChunk are silently ignored;
+// any other encoding yields an error.
+func (p *pool) Put(c Chunk) error {
+	if c.Encoding() != EncXOR {
+		return errors.Errorf("invalid encoding %q", c.Encoding())
+	}
+
+	xc, ok := c.(*XORChunk)
+	if !ok {
+		// This may happen often with wrapped chunks. Nothing we can really do
+		// about it but returning an error would cause a lot of allocations
+		// again. Thus, we just skip it.
+		return nil
+	}
+
+	// Drop the reference to the data so the pooled chunk does not keep it alive.
+	xc.b.stream = nil
+	xc.b.count = 0
+	p.xor.Put(c)
+	return nil
+}
+
+// FromData returns a chunk from a byte slice of chunk data.
+// This is there so that users of the library can easily create chunks from
+// bytes. The slice is used as is, without being copied. Only EncXOR is
+// supported; any other encoding yields an error.
+func FromData(e Encoding, d []byte) (Chunk, error) {
+	if e == EncXOR {
+		return &XORChunk{b: bstream{stream: d}}, nil
+	}
+	return nil, fmt.Errorf("unknown chunk encoding: %d", e)
+}
--- a/tsdb/chunkenc/chunk_test.go
+++ b/tsdb/chunkenc/chunk_test.go
@ -0,0 +1,202 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package chunkenc
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"reflect"
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// pair is a sample consisting of a timestamp and a value; the tests use it
+// to compare appended against iterated samples.
+type pair struct {
+	t int64
+	v float64
+}
+
+// TestChunk runs the append/iterate round trip of testChunk once for every
+// chunk encoding, each in its own sub-test.
+func TestChunk(t *testing.T) {
+	constructors := map[Encoding]func() Chunk{
+		EncXOR: func() Chunk { return NewXORChunk() },
+	}
+	for enc, newChunk := range constructors {
+		t.Run(fmt.Sprintf("%v", enc), func(t *testing.T) {
+			if err := testChunk(newChunk()); err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+}
+
+// testChunk appends 300 samples with random timestamp and value steps to c
+// and checks that iterating over c yields exactly the appended samples.
+// It returns an error describing the first problem encountered.
+func testChunk(c Chunk) error {
+	app, err := c.Appender()
+	if err != nil {
+		return err
+	}
+
+	var (
+		want []pair
+		ts   = int64(1234123324)
+		v    = 1243535.123
+	)
+	for i := 0; i < 300; i++ {
+		ts += int64(rand.Intn(10000) + 1)
+		// Alternate between increasing and decreasing the value.
+		step := float64(rand.Intn(1000000))
+		if i%2 != 0 {
+			step = -step
+		}
+		v += step
+
+		// Start with a new appender every 10th sample. This emulates starting
+		// appending to a partially filled chunk.
+		if i%10 == 0 {
+			if app, err = c.Appender(); err != nil {
+				return err
+			}
+		}
+
+		app.Append(ts, v)
+		want = append(want, pair{t: ts, v: v})
+	}
+
+	var got []pair
+	it := c.Iterator(nil)
+	for it.Next() {
+		t, val := it.At()
+		got = append(got, pair{t: t, v: val})
+	}
+	if err := it.Err(); err != nil {
+		return err
+	}
+	if !reflect.DeepEqual(want, got) {
+		return fmt.Errorf("unexpected result\n\ngot: %v\n\nexp: %v", got, want)
+	}
+	return nil
+}
+
+// benchmarkIterator measures iterating over chunks created by newChunk.
+// It prepares b.N samples, packs them into chunks of at most 251 samples
+// each (every chunk starts again at the first sample) and then, in the timed
+// part, iterates over all chunks while re-using a single iterator.
+func benchmarkIterator(b *testing.B, newChunk func() Chunk) {
+	var (
+		t = int64(1234123324)
+		v = 1243535.123
+	)
+	var exp []pair
+	for i := 0; i < b.N; i++ {
+		// t += int64(rand.Intn(10000) + 1)
+		t += int64(1000)
+		// v = rand.Float64()
+		v += float64(100)
+		exp = append(exp, pair{t: t, v: v})
+	}
+
+	var chunks []Chunk
+	for i := 0; i < b.N; {
+		c := newChunk()
+
+		a, err := c.Appender()
+		if err != nil {
+			b.Fatalf("get appender: %s", err)
+		}
+		j := 0
+		for _, p := range exp {
+			if j > 250 {
+				break
+			}
+			a.Append(p.t, p.v)
+			i++
+			j++
+		}
+		chunks = append(chunks, c)
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	fmt.Println("num", b.N, "created chunks", len(chunks))
+
+	res := make([]float64, 0, 1024)
+
+	var it Iterator
+	for i := 0; i < len(chunks); i++ {
+		c := chunks[i]
+		// Assign to the outer variable rather than declaring a new one so
+		// that the iterator of the previous chunk is actually offered for
+		// re-use; a shadowing ":=" would always pass nil.
+		it = c.Iterator(it)
+
+		for it.Next() {
+			_, v := it.At()
+			res = append(res, v)
+		}
+		if it.Err() != io.EOF {
+			testutil.Ok(b, it.Err())
+		}
+		res = res[:0]
+	}
+}
+
+// BenchmarkXORIterator benchmarks iterating over XOR chunks.
+func BenchmarkXORIterator(b *testing.B) {
+	newXOR := func() Chunk { return NewXORChunk() }
+	benchmarkIterator(b, newXOR)
+}
+
+// BenchmarkXORAppender benchmarks appending to XOR chunks.
+func BenchmarkXORAppender(b *testing.B) {
+	newXOR := func() Chunk { return NewXORChunk() }
+	benchmarkAppender(b, newXOR)
+}
+
+// benchmarkAppender measures appending samples to chunks created by newChunk.
+// It prepares b.N samples up front and, in the timed part, fills chunks with
+// at most 251 samples each until b.N samples were appended in total. Every
+// chunk starts again at the first prepared sample.
+func benchmarkAppender(b *testing.B, newChunk func() Chunk) {
+	var (
+		t   = int64(1234123324)
+		v   = 1243535.123
+		exp []pair
+	)
+	for n := 0; n < b.N; n++ {
+		t += 1000
+		v += 100
+		exp = append(exp, pair{t: t, v: v})
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	// Number of samples that go into a single chunk.
+	perChunk := len(exp)
+	if perChunk > 251 {
+		perChunk = 251
+	}
+
+	var chunks []Chunk
+	for appended := 0; appended < b.N; appended += perChunk {
+		c := newChunk()
+
+		a, err := c.Appender()
+		if err != nil {
+			b.Fatalf("get appender: %s", err)
+		}
+		for _, p := range exp[:perChunk] {
+			a.Append(p.t, p.v)
+		}
+		chunks = append(chunks, c)
+	}
+
+	fmt.Println("num", b.N, "created chunks", len(chunks))
+}
--- a/tsdb/chunkenc/xor.go
+++ b/tsdb/chunkenc/xor.go
@ -0,0 +1,407 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The code in this file was largely written by Damian Gryski as part of
+// https://github.com/dgryski/go-tsz and published under the license below.
+// It was modified to accommodate reading from byte slices without modifying
+// the underlying bytes, which would panic when reading from mmaped
+// read-only byte slices.
+
+// Copyright (c) 2015,2016 Damian Gryski <damian@gryski.com>
+// All rights reserved.
+
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package chunkenc
+
+import (
+	"encoding/binary"
+	"math"
+	"math/bits"
+)
+
+// XORChunk holds XOR encoded sample data.
+type XORChunk struct {
+	// b is the bit stream backing the chunk. Its first two bytes are read as
+	// the big-endian sample count; the encoded samples follow.
+	b bstream
+}
+
+// NewXORChunk returns a new, empty chunk with XOR encoding. The chunk's
+// buffer starts out holding only the 2-byte header and has room for 128 bytes
+// before it needs to grow.
+func NewXORChunk() *XORChunk {
+	return &XORChunk{
+		b: bstream{stream: make([]byte, 2, 128), count: 0},
+	}
+}
+
+// Encoding returns the encoding type, which is always EncXOR for an XORChunk.
+func (c *XORChunk) Encoding() Encoding {
+	return EncXOR
+}
+
+// Bytes returns the underlying byte slice of the chunk as exposed by its bit
+// stream. The slice begins with the 2-byte sample count that NumSamples
+// decodes.
+func (c *XORChunk) Bytes() []byte {
+	return c.b.bytes()
+}
+
+// NumSamples returns the number of samples in the chunk, decoded from the
+// big-endian uint16 stored at the start of the chunk's bytes.
+func (c *XORChunk) NumSamples() int {
+	count := binary.BigEndian.Uint16(c.Bytes())
+	return int(count)
+}
+
+// Appender implements the Chunk interface.
+//
+// The returned appender continues where the existing data ends. To recover
+// the encoder state (last timestamp, value, delta and bit window) the whole
+// chunk is decoded once and the state is copied from the iterator.
+func (c *XORChunk) Appender() (Appender, error) {
+	it := c.iterator(nil)
+	for it.Next() {
+		// Drain the iterator; only its final state is of interest.
+	}
+	err := it.Err()
+	if err != nil {
+		return nil, err
+	}
+
+	// An empty chunk has no previous bit window; 0xff marks it as unset.
+	leading := it.leading
+	if binary.BigEndian.Uint16(c.b.bytes()) == 0 {
+		leading = 0xff
+	}
+
+	return &xorAppender{
+		b:        &c.b,
+		t:        it.t,
+		v:        it.val,
+		tDelta:   it.tDelta,
+		leading:  leading,
+		trailing: it.trailing,
+	}, nil
+}
+
+// iterator returns an xorIterator positioned before the first sample of c.
+// When it already is an *xorIterator it is reset onto c's bytes and returned,
+// which avoids allocating; any other value (including nil) yields a new one.
+func (c *XORChunk) iterator(it Iterator) *xorIterator {
+	// Should iterators guarantee to act on a copy of the data so it doesn't lock append?
+	// When using striped locks to guard access to chunks, probably yes.
+	// Could only copy data if the chunk is not completed yet.
+	reused, ok := it.(*xorIterator)
+	if !ok {
+		fresh := &xorIterator{
+			// The first 2 bytes hold the chunk header (sample count);
+			// sample data starts right after it.
+			br:       newBReader(c.b.bytes()[2:]),
+			numTotal: binary.BigEndian.Uint16(c.b.bytes()),
+		}
+		return fresh
+	}
+	reused.Reset(c.b.bytes())
+	return reused
+}
+
+// Iterator implements the Chunk interface. It delegates to iterator, so
+// passing a previously returned XOR iterator as it lets that value be reused.
+func (c *XORChunk) Iterator(it Iterator) Iterator {
+	return c.iterator(it)
+}
+
+// xorAppender appends samples to the bit stream of an XORChunk. It keeps the
+// state of the last appended sample, which the delta encodings depend on.
+type xorAppender struct {
+	// b is the chunk's bit stream that encoded samples are written to.
+	b *bstream
+
+	// t and v are the timestamp and value of the most recently appended sample.
+	t      int64
+	v      float64
+	// tDelta is the timestamp delta recorded by the most recent Append.
+	tDelta uint64
+
+	// leading and trailing describe the zero-bit window used by the last
+	// value that wrote a new window. leading == 0xff means no window is set.
+	leading  uint8
+	trailing uint8
+}
+
+// Append encodes the sample (t, v) at the end of the chunk's bit stream and
+// increments the sample count in the 2-byte header.
+//
+// The encoding depends on how many samples the chunk already holds:
+//   - none: t as a signed varint followed by the raw 64 bits of v;
+//   - one: the delta to the previous timestamp as an unsigned varint,
+//     followed by the XOR-encoded value;
+//   - more: the delta-of-delta of the timestamp in one of five bit-width
+//     buckets, followed by the XOR-encoded value.
+func (a *xorAppender) Append(t int64, v float64) {
+	count := binary.BigEndian.Uint16(a.b.bytes())
+	var delta uint64
+
+	switch count {
+	case 0:
+		scratch := make([]byte, binary.MaxVarintLen64)
+		used := binary.PutVarint(scratch, t)
+		for _, c := range scratch[:used] {
+			a.b.writeByte(c)
+		}
+		a.b.writeBits(math.Float64bits(v), 64)
+
+	case 1:
+		delta = uint64(t - a.t)
+
+		scratch := make([]byte, binary.MaxVarintLen64)
+		used := binary.PutUvarint(scratch, delta)
+		for _, c := range scratch[:used] {
+			a.b.writeByte(c)
+		}
+
+		a.writeVDelta(v)
+
+	default:
+		delta = uint64(t - a.t)
+		dod := int64(delta - a.tDelta)
+
+		// Gorilla has a max resolution of seconds, Prometheus milliseconds.
+		// Thus we use higher value range steps with larger bit size.
+		switch {
+		case dod == 0:
+			a.b.writeBit(zero)
+		case bitRange(dod, 14):
+			a.b.writeBits(0x02, 2) // '10'
+			a.b.writeBits(uint64(dod), 14)
+		case bitRange(dod, 17):
+			a.b.writeBits(0x06, 3) // '110'
+			a.b.writeBits(uint64(dod), 17)
+		case bitRange(dod, 20):
+			a.b.writeBits(0x0e, 4) // '1110'
+			a.b.writeBits(uint64(dod), 20)
+		default:
+			a.b.writeBits(0x0f, 4) // '1111'
+			a.b.writeBits(uint64(dod), 64)
+		}
+
+		a.writeVDelta(v)
+	}
+
+	// Remember this sample as the reference for the next one.
+	a.t, a.v = t, v
+	binary.BigEndian.PutUint16(a.b.bytes(), count+1)
+	a.tDelta = delta
+}
+
+// bitRange reports whether x fits the nbits-wide bucket used for timestamp
+// delta-of-deltas, i.e. whether -(2^(nbits-1) - 1) <= x <= 2^(nbits-1).
+func bitRange(x int64, nbits uint8) bool {
+	limit := int64(1) << (nbits - 1)
+	return x >= -(limit-1) && x <= limit
+}
+
+// writeVDelta encodes v relative to the previously appended value a.v by
+// writing the XOR of their IEEE 754 bit patterns.
+//
+// Layout written to the stream:
+//   - '0'  : the value is unchanged (XOR is zero);
+//   - '10' : the meaningful XOR bits fit the window from the previous value,
+//     so only the bits inside that window follow;
+//   - '11' : a new window follows as 5 bits of leading-zero count and 6 bits
+//     of significant-bit count, then the significant bits themselves.
+func (a *xorAppender) writeVDelta(v float64) {
+	vDelta := math.Float64bits(v) ^ math.Float64bits(a.v)
+
+	if vDelta == 0 {
+		a.b.writeBit(zero)
+		return
+	}
+	a.b.writeBit(one)
+
+	leading := uint8(bits.LeadingZeros64(vDelta))
+	trailing := uint8(bits.TrailingZeros64(vDelta))
+
+	// Clamp number of leading zeros to avoid overflow when encoding.
+	// (The count is stored in 5 bits below, so 31 is the largest value.)
+	if leading >= 32 {
+		leading = 31
+	}
+
+	// Reuse the stored window only if one exists (0xff marks "none") and the
+	// new XOR has no set bits outside of it.
+	if a.leading != 0xff && leading >= a.leading && trailing >= a.trailing {
+		a.b.writeBit(zero)
+		a.b.writeBits(vDelta>>a.trailing, 64-int(a.leading)-int(a.trailing))
+	} else {
+		// Store the new window so following values can try to reuse it.
+		a.leading, a.trailing = leading, trailing
+
+		a.b.writeBit(one)
+		a.b.writeBits(uint64(leading), 5)
+
+		// Note that if leading == trailing == 0, then sigbits == 64.  But that value doesn't actually fit into the 6 bits we have.
+		// Luckily, we never need to encode 0 significant bits, since that would put us in the other case (vdelta == 0).
+		// So instead we write out a 0 and adjust it back to 64 on unpacking.
+		sigbits := 64 - leading - trailing
+		a.b.writeBits(uint64(sigbits), 6)
+		a.b.writeBits(vDelta>>trailing, int(sigbits))
+	}
+}
+
+// xorIterator decodes the samples of an XOR chunk one at a time.
+type xorIterator struct {
+	// br reads bits from the chunk data following the 2-byte header.
+	br       bstream
+	// numTotal is the sample count taken from the chunk header.
+	numTotal uint16
+	// numRead is the number of samples decoded so far.
+	numRead  uint16
+
+	// t and val are the timestamp and value of the current sample.
+	t   int64
+	val float64
+
+	// leading and trailing describe the zero-bit window of the last value
+	// that carried an explicit window.
+	leading  uint8
+	trailing uint8
+
+	// tDelta is the most recent timestamp delta.
+	tDelta uint64
+	// err is the first error hit while decoding; once set, Next returns false.
+	err    error
+}
+
+// At returns the timestamp and value of the sample decoded by the last
+// successful call to Next.
+func (it *xorIterator) At() (int64, float64) {
+	return it.t, it.val
+}
+
+// Err returns the error that stopped iteration, or nil if none occurred.
+func (it *xorIterator) Err() error {
+	return it.err
+}
+
+// Reset re-targets the iterator at the chunk bytes b and clears all decoding
+// state, so the next call to Next yields the first sample of b.
+func (it *xorIterator) Reset(b []byte) {
+	// The first 2 bytes hold the chunk header (sample count); sample data
+	// starts right after it. Every remaining field is left at its zero value.
+	*it = xorIterator{
+		br:       newBReader(b[2:]),
+		numTotal: binary.BigEndian.Uint16(b),
+	}
+}
+
+// Next decodes the next sample and reports whether one is available. It
+// returns false once all samples named in the chunk header have been read or
+// after a decoding error, which is then available through Err.
+//
+// The first sample is stored as a signed varint timestamp plus a raw 64-bit
+// value, the second as an unsigned varint timestamp delta plus an XOR-encoded
+// value, and every later sample as a delta-of-delta plus an XOR-encoded value.
+func (it *xorIterator) Next() bool {
+	if it.err != nil || it.numRead == it.numTotal {
+		return false
+	}
+
+	if it.numRead == 0 {
+		t, err := binary.ReadVarint(&it.br)
+		if err != nil {
+			it.err = err
+			return false
+		}
+		v, err := it.br.readBits(64)
+		if err != nil {
+			it.err = err
+			return false
+		}
+		it.t = t
+		it.val = math.Float64frombits(v)
+
+		it.numRead++
+		return true
+	}
+	if it.numRead == 1 {
+		tDelta, err := binary.ReadUvarint(&it.br)
+		if err != nil {
+			it.err = err
+			return false
+		}
+		it.tDelta = tDelta
+		it.t = it.t + int64(it.tDelta)
+
+		// readValue also advances numRead.
+		return it.readValue()
+	}
+
+	var d byte
+	// read delta-of-delta
+	// The prefix is a run of up to four 1 bits ended by a 0 bit; it selects
+	// the width of the delta-of-delta that follows.
+	for i := 0; i < 4; i++ {
+		d <<= 1
+		bit, err := it.br.readBit()
+		if err != nil {
+			it.err = err
+			return false
+		}
+		if bit == zero {
+			break
+		}
+		d |= 1
+	}
+	var sz uint8
+	var dod int64
+	switch d {
+	case 0x00:
+		// dod == 0
+	case 0x02:
+		sz = 14
+	case 0x06:
+		sz = 17
+	case 0x0e:
+		sz = 20
+	case 0x0f:
+		// Full-width case: the delta-of-delta is stored as 64 raw bits.
+		bits, err := it.br.readBits(64)
+		if err != nil {
+			it.err = err
+			return false
+		}
+
+		dod = int64(bits)
+	}
+
+	if sz != 0 {
+		bits, err := it.br.readBits(int(sz))
+		if err != nil {
+			it.err = err
+			return false
+		}
+		if bits > (1 << (sz - 1)) {
+			// The encoder stored the low sz bits of a signed value. Anything
+			// above 2^(sz-1) stands for a negative number, so subtract 2^sz
+			// to restore its sign.
+			bits = bits - (1 << sz)
+		}
+		dod = int64(bits)
+	}
+
+	it.tDelta = uint64(int64(it.tDelta) + dod)
+	it.t = it.t + int64(it.tDelta)
+
+	return it.readValue()
+}
+
+// readValue decodes one XOR-encoded value into it.val and advances numRead.
+// It returns false and records the error in it.err if the stream runs short.
+func (it *xorIterator) readValue() bool {
+	changed, err := it.br.readBit()
+	if err != nil {
+		it.err = err
+		return false
+	}
+	if changed == zero {
+		// The value is identical to the previous one.
+		it.numRead++
+		return true
+	}
+
+	newWindow, err := it.br.readBit()
+	if err != nil {
+		it.err = err
+		return false
+	}
+	// A zero bit means the leading/trailing window of the previous value is
+	// reused; otherwise a new window is read first.
+	if newWindow != zero {
+		lead, err := it.br.readBits(5)
+		if err != nil {
+			it.err = err
+			return false
+		}
+		it.leading = uint8(lead)
+
+		sig, err := it.br.readBits(6)
+		if err != nil {
+			it.err = err
+			return false
+		}
+		nsig := uint8(sig)
+		// 0 significant bits here means we overflowed and we actually need 64; see comment in encoder
+		if nsig == 0 {
+			nsig = 64
+		}
+		it.trailing = 64 - it.leading - nsig
+	}
+
+	width := int(64 - it.leading - it.trailing)
+	payload, err := it.br.readBits(width)
+	if err != nil {
+		it.err = err
+		return false
+	}
+	it.val = math.Float64frombits(math.Float64bits(it.val) ^ (payload << it.trailing))
+
+	it.numRead++
+	return true
+}
--- a/tsdb/chunks/chunks.go
+++ b/tsdb/chunks/chunks.go
@ -0,0 +1,512 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package chunks
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"hash"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/chunkenc"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/fileutil"
+)
+
+const (
+	// MagicChunks is 4 bytes at the head of a series file.
+	MagicChunks = 0x85BD40DD
+	// MagicChunksSize is the size in bytes of MagicChunks.
+	MagicChunksSize = 4
+
+	// chunksFormatV1 is the format version stored after the magic number; it
+	// is the only version newReader accepts.
+	chunksFormatV1          = 1
+	// ChunksFormatVersionSize is the size in bytes of the format version field.
+	ChunksFormatVersionSize = 1
+
+	// chunkHeaderSize is the number of leading bytes a segment must have for
+	// its magic number and version to be checked. Note that Writer.cut writes
+	// an 8-byte header, of which only these first bytes are interpreted.
+	chunkHeaderSize = MagicChunksSize + ChunksFormatVersionSize
+)
+
+// Meta holds information about a chunk of data: where to find it (or the data
+// itself) and the time range it covers.
+type Meta struct {
+	// Ref and Chunk hold either a reference that can be used to retrieve
+	// chunk data or the data itself.
+	// Generally, only one of them is set.
+	// Writer.WriteChunks fills in Ref as (segment index << 32) | offset.
+	Ref   uint64
+	Chunk chunkenc.Chunk
+
+	// Time range the data covers.
+	// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
+	MinTime, MaxTime int64
+}
+
+// writeHash feeds the chunk's encoding byte followed by its raw bytes into h.
+// buf is scratch space for the encoding byte and is overwritten.
+func (cm *Meta) writeHash(h hash.Hash, buf []byte) error {
+	enc := append(buf[:0], byte(cm.Chunk.Encoding()))
+	_, err := h.Write(enc[:1])
+	if err != nil {
+		return err
+	}
+	_, err = h.Write(cm.Chunk.Bytes())
+	return err
+}
+
+// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
+func (cm *Meta) OverlapsClosedInterval(mint, maxt int64) bool {
+	// The chunk covers the closed interval [cm.MinTime, cm.MaxTime]. Two closed
+	// intervals are disjoint exactly when one starts after the other ends.
+	disjoint := cm.MinTime > maxt || mint > cm.MaxTime
+	return !disjoint
+}
+
+var (
+	// errInvalidSize is the base error newReader wraps when a segment is too
+	// short to hold the chunk file header.
+	errInvalidSize = fmt.Errorf("invalid size")
+)
+
+// castagnoliTable is the CRC32 table used for chunk checksums. It is built
+// once in init and shared by every hash returned from newCRC32.
+var castagnoliTable *crc32.Table
+
+// init builds the Castagnoli polynomial table.
+func init() {
+	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
+}
+
+// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
+// polynomial may be easily changed in one location at a later time, if necessary.
+// The polynomial currently in use is Castagnoli (see castagnoliTable).
+func newCRC32() hash.Hash32 {
+	return crc32.New(castagnoliTable)
+}
+
+// Writer implements the ChunkWriter interface for the standard
+// serialization format.
+type Writer struct {
+	// dirFile is the open handle of the directory segments are created in.
+	dirFile *os.File
+	// files lists the segment files cut so far; the last one is the tail
+	// currently written to.
+	files   []*os.File
+	// wbuf buffers writes to the tail segment; it is nil until the first cut.
+	wbuf    *bufio.Writer
+	// n is the number of bytes written to the tail segment, header included.
+	n       int64
+	// crc32 is the reusable hash for per-chunk checksums.
+	crc32   hash.Hash
+	// buf is scratch space for the length varint, encoding byte and checksum.
+	buf     [binary.MaxVarintLen32]byte
+
+	// segmentSize is the size segments are preallocated to and the limit
+	// WriteChunks checks before cutting a new segment.
+	segmentSize int64
+}
+
+const (
+	// defaultChunkSegmentSize is the segment size (512 MiB) NewWriter
+	// configures on every Writer it creates.
+	defaultChunkSegmentSize = 512 * 1024 * 1024
+)
+
+// NewWriter returns a new writer against the given directory. The directory
+// is created if it does not exist. No segment file is created until chunks
+// are written.
+func NewWriter(dir string) (*Writer, error) {
+	err := os.MkdirAll(dir, 0777)
+	if err != nil {
+		return nil, err
+	}
+
+	dirFile, err := fileutil.OpenDir(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	return &Writer{
+		dirFile:     dirFile,
+		crc32:       newCRC32(),
+		segmentSize: defaultChunkSegmentSize,
+	}, nil
+}
+
+// tail returns the segment file currently being written to, or nil if no
+// segment has been cut yet.
+func (w *Writer) tail() *os.File {
+	if n := len(w.files); n > 0 {
+		return w.files[n-1]
+	}
+	return nil
+}
+
+// finalizeTail flushes buffered data to the current tail segment, syncs it,
+// cuts the file down to the bytes actually written and closes it. It is a
+// no-op when no segment exists yet.
+func (w *Writer) finalizeTail() error {
+	f := w.tail()
+	if f == nil {
+		return nil
+	}
+
+	err := w.wbuf.Flush()
+	if err != nil {
+		return err
+	}
+	if err = f.Sync(); err != nil {
+		return err
+	}
+
+	// The file was preallocated, so everything past the current write
+	// position is superfluous zero bytes that get truncated away.
+	pos, err := f.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+	if err = f.Truncate(pos); err != nil {
+		return err
+	}
+
+	return f.Close()
+}
+
+func (w *Writer) cut() error {
+	// Sync current tail to disk and close.
+	if err := w.finalizeTail(); err != nil {
+		return err
+	}
+
+	p, _, err := nextSequenceFile(w.dirFile.Name())
+	if err != nil {
+		return err
+	}
+	f, err := os.OpenFile(p, os.O_WRONLY|os.O_CREATE, 0666)
+	if err != nil {
+		return err
+	}
+	if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil {
+		return err
+	}
+	if err = w.dirFile.Sync(); err != nil {
+		return err
+	}
+
+	// Write header metadata for new file.
+	metab := make([]byte, 8)
+	binary.BigEndian.PutUint32(metab[:MagicChunksSize], MagicChunks)
+	metab[4] = chunksFormatV1
+
+	if _, err := f.Write(metab); err != nil {
+		return err
+	}
+
+	w.files = append(w.files, f)
+	if w.wbuf != nil {
+		w.wbuf.Reset(f)
+	} else {
+		w.wbuf = bufio.NewWriterSize(f, 8*1024*1024)
+	}
+	w.n = 8
+
+	return nil
+}
+
+// write sends b through the segment's write buffer and adds the number of
+// bytes accepted to the running segment size, even when the write fails.
+func (w *Writer) write(b []byte) error {
+	written, err := w.wbuf.Write(b)
+	w.n += int64(written)
+	return err
+}
+
+// MergeOverlappingChunks merges chunks whose time ranges overlap into a
+// single chunk, so the result contains no overlapping chunks. Where both
+// chunks hold a sample for the same timestamp, the one from the later chunk
+// in chks is retained.
+// The input must be sorted by MinTime; it is returned unchanged when it holds
+// fewer than two chunks.
+func MergeOverlappingChunks(chks []Meta) ([]Meta, error) {
+	if len(chks) < 2 {
+		return chks, nil
+	}
+
+	merged := make([]Meta, 1, len(chks))
+	merged[0] = chks[0]
+
+	for i := 1; i < len(chks); i++ {
+		next := chks[i]
+		// Only the last merged chunk can overlap with next: the merged chunks
+		// do not overlap each other and, with the input sorted by MinTime,
+		// next starts no earlier than the last merged chunk does.
+		tail := &merged[len(merged)-1]
+
+		if next.MinTime > tail.MaxTime {
+			merged = append(merged, next)
+			continue
+		}
+
+		if tail.MaxTime < next.MaxTime {
+			tail.MaxTime = next.MaxTime
+		}
+		chk, err := MergeChunks(tail.Chunk, next.Chunk)
+		if err != nil {
+			return nil, err
+		}
+		tail.Chunk = chk
+	}
+
+	return merged, nil
+}
+
+// MergeChunks vertically merges a and b into a new XOR chunk, emitting the
+// samples of both in timestamp order. If a and b both hold a sample for the
+// same timestamp, the sample from b is kept and the one from a is discarded.
+func MergeChunks(a, b chunkenc.Chunk) (*chunkenc.XORChunk, error) {
+	merged := chunkenc.NewXORChunk()
+	app, err := merged.Appender()
+	if err != nil {
+		return nil, err
+	}
+
+	ait, bit := a.Iterator(nil), b.Iterator(nil)
+	aok, bok := ait.Next(), bit.Next()
+
+	// Classic two-way merge while both inputs still have samples.
+	for aok && bok {
+		at, av := ait.At()
+		bt, bv := bit.At()
+		switch {
+		case at < bt:
+			app.Append(at, av)
+			aok = ait.Next()
+		case bt < at:
+			app.Append(bt, bv)
+			bok = bit.Next()
+		default:
+			// Same timestamp: b wins, both iterators advance.
+			app.Append(bt, bv)
+			aok = ait.Next()
+			bok = bit.Next()
+		}
+	}
+	// Copy whatever is left of the input that has not run dry.
+	for ; aok; aok = ait.Next() {
+		at, av := ait.At()
+		app.Append(at, av)
+	}
+	for ; bok; bok = bit.Next() {
+		bt, bv := bit.At()
+		app.Append(bt, bv)
+	}
+
+	if ait.Err() != nil {
+		return nil, ait.Err()
+	}
+	if bit.Err() != nil {
+		return nil, bit.Err()
+	}
+	return merged, nil
+}
+
+func (w *Writer) WriteChunks(chks ...Meta) error {
+	// Calculate maximum space we need and cut a new segment in case
+	// we don't fit into the current one.
+	maxLen := int64(binary.MaxVarintLen32) // The number of chunks.
+	for _, c := range chks {
+		maxLen += binary.MaxVarintLen32 + 1 // The number of bytes in the chunk and its encoding.
+		maxLen += int64(len(c.Chunk.Bytes()))
+		maxLen += 4 // The 4 bytes of crc32
+	}
+	newsz := w.n + maxLen
+
+	if w.wbuf == nil || newsz > w.segmentSize && maxLen <= w.segmentSize {
+		if err := w.cut(); err != nil {
+			return err
+		}
+	}
+
+	var seq = uint64(w.seq()) << 32
+	for i := range chks {
+		chk := &chks[i]
+
+		chk.Ref = seq | uint64(w.n)
+
+		n := binary.PutUvarint(w.buf[:], uint64(len(chk.Chunk.Bytes())))
+
+		if err := w.write(w.buf[:n]); err != nil {
+			return err
+		}
+		w.buf[0] = byte(chk.Chunk.Encoding())
+		if err := w.write(w.buf[:1]); err != nil {
+			return err
+		}
+		if err := w.write(chk.Chunk.Bytes()); err != nil {
+			return err
+		}
+
+		w.crc32.Reset()
+		if err := chk.writeHash(w.crc32, w.buf[:]); err != nil {
+			return err
+		}
+		if err := w.write(w.crc32.Sum(w.buf[:0])); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// seq returns the zero-based index of the current tail segment within
+// w.files, or -1 if no segment has been cut yet.
+func (w *Writer) seq() int {
+	return len(w.files) - 1
+}
+
+// Close finalizes the tail segment and then closes the directory handle. If
+// finalizing fails, that error is returned and the directory handle is left
+// open.
+func (w *Writer) Close() error {
+	err := w.finalizeTail()
+	if err == nil {
+		// close dir file (if not windows platform will fail on rename)
+		err = w.dirFile.Close()
+	}
+	return err
+}
+
+// ByteSlice abstracts a byte slice.
+type ByteSlice interface {
+	// Len returns the number of bytes available.
+	Len() int
+	// Range returns the bytes from start up to, but not including, end.
+	Range(start, end int) []byte
+}
+
+// realByteSlice is the ByteSlice implementation backed by a plain []byte.
+type realByteSlice []byte
+
+// Len returns the length of the underlying slice.
+func (b realByteSlice) Len() int {
+	return len(b)
+}
+
+// Range returns b[start:end]. It panics, like any slice expression, when the
+// bounds are out of range.
+func (b realByteSlice) Range(start, end int) []byte {
+	return b[start:end]
+}
+
+// Sub returns b[start:end] as a ByteSlice that shares memory with b.
+func (b realByteSlice) Sub(start, end int) ByteSlice {
+	return b[start:end]
+}
+
+// Reader implements a ChunkReader for a serialized byte stream
+// of series data.
+type Reader struct {
+	bs   []ByteSlice // The underlying bytes holding the encoded series data, one entry per segment.
+	cs   []io.Closer // Closers for resources behind the byte slices.
+	size int64       // The total size of bytes in the reader.
+	pool chunkenc.Pool // Pool that Chunk obtains chunk objects from.
+}
+
+func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, error) {
+	cr := Reader{pool: pool, bs: bs, cs: cs}
+	var totalSize int64
+
+	for i, b := range cr.bs {
+		if b.Len() < chunkHeaderSize {
+			return nil, errors.Wrapf(errInvalidSize, "invalid chunk header in segment %d", i)
+		}
+		// Verify magic number.
+		if m := binary.BigEndian.Uint32(b.Range(0, MagicChunksSize)); m != MagicChunks {
+			return nil, errors.Errorf("invalid magic number %x", m)
+		}
+
+		// Verify chunk format version.
+		if v := int(b.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)[0]); v != chunksFormatV1 {
+			return nil, errors.Errorf("invalid chunk format version %d", v)
+		}
+		totalSize += int64(b.Len())
+	}
+	cr.size = totalSize
+	return &cr, nil
+}
+
+// NewDirReader returns a new Reader against sequentially numbered files in the
+// given directory. Every segment file is memory-mapped; if mapping or header
+// validation fails, all files mapped so far are closed again. A nil pool is
+// replaced by a fresh chunkenc pool.
+func NewDirReader(dir string, pool chunkenc.Pool) (*Reader, error) {
+	files, err := sequenceFiles(dir)
+	if err != nil {
+		return nil, err
+	}
+	if pool == nil {
+		pool = chunkenc.NewPool()
+	}
+
+	var (
+		segments []ByteSlice
+		closers  []io.Closer
+		merr     tsdb_errors.MultiError
+	)
+	// fail records cause, releases everything opened so far and produces the
+	// error result.
+	fail := func(cause error) (*Reader, error) {
+		merr.Add(cause)
+		merr.Add(closeAll(closers))
+		return nil, merr
+	}
+
+	for _, fn := range files {
+		f, err := fileutil.OpenMmapFile(fn)
+		if err != nil {
+			return fail(errors.Wrap(err, "mmap files"))
+		}
+		closers = append(closers, f)
+		segments = append(segments, realByteSlice(f.Bytes()))
+	}
+
+	reader, err := newReader(segments, closers, pool)
+	if err != nil {
+		return fail(err)
+	}
+	return reader, nil
+}
+
+// Close closes all resources backing the reader's segments and returns the
+// combined error of the individual Close calls.
+func (s *Reader) Close() error {
+	return closeAll(s.cs)
+}
+
+// Size returns the size of the chunks, i.e. the summed length in bytes of all
+// segments the reader was created with (segment headers included).
+func (s *Reader) Size() int64 {
+	return s.size
+}
+
+// Chunk returns a chunk from a given reference.
+//
+// The upper 32 bits of ref select the segment, the lower 32 bits are the byte
+// offset of the chunk inside that segment. At that offset the chunk is stored
+// as a uvarint data length, one encoding byte and the chunk data.
+//
+// A reference that points outside the segment, or whose encoded length runs
+// past the end of the segment, yields an error instead of an out-of-range
+// slice panic.
+func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) {
+	var (
+		sgmSeq    = int(ref >> 32)
+		sgmOffset = int((ref << 32) >> 32)
+	)
+	if sgmSeq >= len(s.bs) {
+		return nil, errors.Errorf("reference sequence %d out of range", sgmSeq)
+	}
+	chkS := s.bs[sgmSeq]
+	sgmLen := chkS.Len()
+
+	if sgmOffset >= sgmLen {
+		return nil, errors.Errorf("offset %d beyond data size %d", sgmOffset, sgmLen)
+	}
+
+	// Read the length varint from at most MaxVarintLen32 bytes, clamped to
+	// the end of the segment so a reference near the end cannot over-read.
+	lenEnd := sgmOffset + binary.MaxVarintLen32
+	if lenEnd > sgmLen {
+		lenEnd = sgmLen
+	}
+	chkLen, n := binary.Uvarint(chkS.Range(sgmOffset, lenEnd))
+	if n <= 0 {
+		return nil, errors.Errorf("reading chunk length failed with %d", n)
+	}
+
+	// Make sure the encoding byte plus chkLen data bytes are really there.
+	// Comparing against the remaining size avoids overflowing int on a bogus
+	// length.
+	chkStart := sgmOffset + n
+	if chkStart >= sgmLen || chkLen > uint64(sgmLen-chkStart-1) {
+		return nil, errors.Errorf("chunk at offset %d with length %d exceeds data size %d", sgmOffset, chkLen, sgmLen)
+	}
+	chk := chkS.Range(chkStart, chkStart+1+int(chkLen))
+
+	return s.pool.Get(chunkenc.Encoding(chk[0]), chk[1:1+chkLen])
+}
+
+func nextSequenceFile(dir string) (string, int, error) {
+	names, err := fileutil.ReadDir(dir)
+	if err != nil {
+		return "", 0, err
+	}
+
+	i := uint64(0)
+	for _, n := range names {
+		j, err := strconv.ParseUint(n, 10, 64)
+		if err != nil {
+			continue
+		}
+		i = j
+	}
+	return filepath.Join(dir, fmt.Sprintf("%0.6d", i+1)), int(i + 1), nil
+}
+
+// sequenceFiles returns the paths of all entries in dir whose name parses as
+// an unsigned decimal integer, in the order the directory listing yields
+// them.
+func sequenceFiles(dir string) ([]string, error) {
+	entries, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	var paths []string
+	for _, entry := range entries {
+		name := entry.Name()
+		if _, perr := strconv.ParseUint(name, 10, 64); perr == nil {
+			paths = append(paths, filepath.Join(dir, name))
+		}
+	}
+	return paths, nil
+}
+
+// closeAll closes every closer in cs, continuing past failures, and returns
+// the collected errors.
+func closeAll(cs []io.Closer) error {
+	var merr tsdb_errors.MultiError
+	for i := range cs {
+		merr.Add(cs[i].Close())
+	}
+	return merr.Err()
+}
--- a/tsdb/chunks/chunks_test.go
+++ b/tsdb/chunks/chunks_test.go
@ -0,0 +1,28 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package chunks
+
+import (
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestReaderWithInvalidBuffer checks that looking up a chunk in a segment
+// holding only varint continuation bytes returns an error.
+func TestReaderWithInvalidBuffer(t *testing.T) {
+	invalid := []byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81}
+	r := &Reader{bs: []ByteSlice{realByteSlice(invalid)}}
+
+	_, err := r.Chunk(0)
+	testutil.NotOk(t, err)
+}
--- a/tsdb/cmd/tsdb/.gitignore
+++ b/tsdb/cmd/tsdb/.gitignore
@ -0,0 +1,3 @@
+testdata*
+tsdb
+benchout
--- a/tsdb/cmd/tsdb/README.md
+++ b/tsdb/cmd/tsdb/README.md
@ -0,0 +1,3 @@
+TODO:
+- [ ] add tabular output
+- [ ] split commands into separate files
--- a/tsdb/cmd/tsdb/main.go
+++ b/tsdb/cmd/tsdb/main.go
@ -0,0 +1,653 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"runtime"
+	"runtime/pprof"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"text/tabwriter"
+	"time"
+
+	"github.com/go-kit/kit/log"
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb"
+	"github.com/prometheus/tsdb/chunks"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/labels"
+	"gopkg.in/alecthomas/kingpin.v2"
+)
+
+// main runs the CLI and exits with status 1 after printing the error to
+// stderr if the selected command fails.
+func main() {
+	err := execute()
+	if err == nil {
+		return
+	}
+	fmt.Fprintln(os.Stderr, err)
+	os.Exit(1)
+}
+
+// execute parses the command line and runs the selected sub-command
+// (bench write, ls, analyze or dump).
+//
+// The result is named on purpose: for the commands that open the database
+// read-only, the deferred closeDB call merges the error from closing the
+// database into err, so a failing Close is reported even if the command
+// itself succeeded.
+func execute() (err error) {
+	var (
+		defaultDBPath = filepath.Join("benchout", "storage")
+
+		cli                  = kingpin.New(filepath.Base(os.Args[0]), "CLI tool for tsdb")
+		benchCmd             = cli.Command("bench", "run benchmarks")
+		benchWriteCmd        = benchCmd.Command("write", "run a write performance benchmark")
+		benchWriteOutPath    = benchWriteCmd.Flag("out", "set the output path").Default("benchout").String()
+		benchWriteNumMetrics = benchWriteCmd.Flag("metrics", "number of metrics to read").Default("10000").Int()
+		benchSamplesFile     = benchWriteCmd.Arg("file", "input file with samples data, default is ("+filepath.Join("..", "..", "testdata", "20kseries.json")+")").Default(filepath.Join("..", "..", "testdata", "20kseries.json")).String()
+		listCmd              = cli.Command("ls", "list db blocks")
+		listCmdHumanReadable = listCmd.Flag("human-readable", "print human readable values").Short('h').Bool()
+		listPath             = listCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
+		analyzeCmd           = cli.Command("analyze", "analyze churn, label pair cardinality.")
+		analyzePath          = analyzeCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
+		analyzeBlockID       = analyzeCmd.Arg("block id", "block to analyze (default is the last block)").String()
+		analyzeLimit         = analyzeCmd.Flag("limit", "how many items to show in each list").Default("20").Int()
+		dumpCmd              = cli.Command("dump", "dump samples from a TSDB")
+		dumpPath             = dumpCmd.Arg("db path", "database path (default is "+defaultDBPath+")").Default(defaultDBPath).String()
+		dumpMinTime          = dumpCmd.Flag("min-time", "minimum timestamp to dump").Default(strconv.FormatInt(math.MinInt64, 10)).Int64()
+		dumpMaxTime          = dumpCmd.Flag("max-time", "maximum timestamp to dump").Default(strconv.FormatInt(math.MaxInt64, 10)).Int64()
+	)
+
+	logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stderr))
+	var merr tsdb_errors.MultiError
+
+	// closeDB closes db and merges the outcome into the named result err.
+	// It is declared here, outside the case clauses, so that err refers to
+	// the function result and not to a local err declared with ":=" inside a
+	// case, which would make the Close error vanish.
+	closeDB := func(db io.Closer) {
+		merr.Add(err)
+		merr.Add(db.Close())
+		err = merr.Err()
+	}
+
+	switch kingpin.MustParse(cli.Parse(os.Args[1:])) {
+	case benchWriteCmd.FullCommand():
+		wb := &writeBenchmark{
+			outPath:     *benchWriteOutPath,
+			numMetrics:  *benchWriteNumMetrics,
+			samplesFile: *benchSamplesFile,
+			logger:      logger,
+		}
+		return wb.run()
+	case listCmd.FullCommand():
+		db, err := tsdb.OpenDBReadOnly(*listPath, nil)
+		if err != nil {
+			return err
+		}
+		defer closeDB(db)
+		blocks, err := db.Blocks()
+		if err != nil {
+			return err
+		}
+		printBlocks(blocks, listCmdHumanReadable)
+	case analyzeCmd.FullCommand():
+		db, err := tsdb.OpenDBReadOnly(*analyzePath, nil)
+		if err != nil {
+			return err
+		}
+		defer closeDB(db)
+		blocks, err := db.Blocks()
+		if err != nil {
+			return err
+		}
+		// Pick the requested block, or the last one if none was requested.
+		var block tsdb.BlockReader
+		if *analyzeBlockID != "" {
+			for _, b := range blocks {
+				if b.Meta().ULID.String() == *analyzeBlockID {
+					block = b
+					break
+				}
+			}
+		} else if len(blocks) > 0 {
+			block = blocks[len(blocks)-1]
+		}
+		if block == nil {
+			return fmt.Errorf("block not found")
+		}
+		return analyzeBlock(block, *analyzeLimit)
+	case dumpCmd.FullCommand():
+		db, err := tsdb.OpenDBReadOnly(*dumpPath, nil)
+		if err != nil {
+			return err
+		}
+		defer closeDB(db)
+		return dumpSamples(db, *dumpMinTime, *dumpMaxTime)
+	}
+	return nil
+}
+
+// writeBenchmark holds the configuration and runtime state of the
+// "bench write" command.
+type writeBenchmark struct {
+	// outPath is the output directory; run wipes and recreates it and places
+	// the storage directory and the profile files inside.
+	outPath     string
+	// samplesFile is the input file the series labels are read from.
+	samplesFile string
+	// cleanup is set by run when outPath was empty and a temporary directory
+	// was created in its place.
+	cleanup     bool
+	// numMetrics is passed to readPrometheusLabels when loading samplesFile.
+	numMetrics  int
+
+	// storage is the database opened by run that samples are ingested into.
+	storage *tsdb.DB
+
+	// Profile output files created by startProfiling.
+	cpuprof   *os.File
+	memprof   *os.File
+	blockprof *os.File
+	mtxprof   *os.File
+	logger    log.Logger
+}
+
+// run executes the write benchmark: it prepares an empty output directory,
+// opens a database in its "storage" sub-directory, loads the series labels
+// from the samples file, ingests scrapes while profiling, prints the
+// throughput and finally closes the storage and stops profiling.
+//
+// The first error of any stage is returned, including a failure to start
+// profiling.
+func (b *writeBenchmark) run() error {
+	if b.outPath == "" {
+		dir, err := ioutil.TempDir("", "tsdb_bench")
+		if err != nil {
+			return err
+		}
+		b.outPath = dir
+		b.cleanup = true
+	}
+	// Start from an empty output directory.
+	if err := os.RemoveAll(b.outPath); err != nil {
+		return err
+	}
+	if err := os.MkdirAll(b.outPath, 0777); err != nil {
+		return err
+	}
+
+	dir := filepath.Join(b.outPath, "storage")
+
+	l := log.With(b.logger, "ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller)
+
+	st, err := tsdb.Open(dir, l, nil, &tsdb.Options{
+		RetentionDuration: 15 * 24 * 60 * 60 * 1000, // 15 days in milliseconds
+		BlockRanges:       tsdb.ExponentialBlockRanges(2*60*60*1000, 5, 3),
+	})
+	if err != nil {
+		return err
+	}
+	b.storage = st
+
+	// Named lbls rather than labels so the labels package is not shadowed.
+	var lbls []labels.Labels
+
+	_, err = measureTime("readData", func() error {
+		f, err := os.Open(b.samplesFile)
+		if err != nil {
+			return err
+		}
+		defer f.Close()
+
+		lbls, err = readPrometheusLabels(f, b.numMetrics)
+		return err
+	})
+	if err != nil {
+		return err
+	}
+
+	var total uint64
+
+	dur, err := measureTime("ingestScrapes", func() error {
+		// Without profiles the benchmark output is incomplete, so a failure
+		// to start profiling aborts the run instead of being ignored.
+		if err := b.startProfiling(); err != nil {
+			return err
+		}
+		var err error
+		total, err = b.ingestScrapes(lbls, 3000)
+		return err
+	})
+	if err != nil {
+		return err
+	}
+
+	fmt.Println(" > total samples:", total)
+	fmt.Println(" > samples/sec:", float64(total)/dur.Seconds())
+
+	_, err = measureTime("stopStorage", func() error {
+		if err := b.storage.Close(); err != nil {
+			return err
+		}
+		return b.stopProfiling()
+	})
+	return err
+}
+
+// timeDelta is the timestamp step between two consecutive scrapes of the
+// write benchmark (presumably milliseconds, i.e. 30s — confirm against the
+// storage's timestamp unit).
+const timeDelta = 30000
+
+// ingestScrapes ingests scrapeCount scrapes for all series in lbls and
+// returns the number of samples written.
+//
+// The work is done in rounds of 100 scrapes. Within a round the series are
+// split into batches of up to 1000, each ingested by its own goroutine; the
+// next round starts once all batches have finished. Errors from a batch are
+// printed but not returned.
+func (b *writeBenchmark) ingestScrapes(lbls []labels.Labels, scrapeCount int) (uint64, error) {
+	var (
+		mu    sync.Mutex
+		total uint64
+	)
+
+	for i := 0; i < scrapeCount; i += 100 {
+		var wg sync.WaitGroup
+
+		for rest := lbls; len(rest) > 0; {
+			size := len(rest)
+			if size > 1000 {
+				size = 1000
+			}
+			batch := rest[:size]
+			rest = rest[size:]
+
+			wg.Add(1)
+			go func() {
+				n, err := b.ingestScrapesShard(batch, 100, int64(timeDelta*i))
+				if err != nil {
+					// exitWithError(err)
+					fmt.Println(" err", err)
+				}
+				mu.Lock()
+				total += n
+				mu.Unlock()
+				wg.Done()
+			}()
+		}
+		wg.Wait()
+	}
+	fmt.Println("ingestion completed")
+
+	return total, nil
+}
+
+// ingestScrapesShard writes scrapeCount scrapes for the series in lbls,
+// starting timeDelta after baset and advancing by timeDelta per scrape. Each
+// scrape is committed through its own appender. It returns the number of
+// samples added; a failing commit ends the shard with that error, while a
+// failing add panics.
+func (b *writeBenchmark) ingestScrapesShard(lbls []labels.Labels, scrapeCount int, baset int64) (uint64, error) {
+	type sample struct {
+		labels labels.Labels
+		value  int64
+		ref    *uint64
+	}
+
+	scrape := make([]*sample, 0, len(lbls))
+	for _, m := range lbls {
+		scrape = append(scrape, &sample{labels: m, value: 123456789})
+	}
+
+	var total uint64
+	ts := baset
+
+	for i := 0; i < scrapeCount; i++ {
+		app := b.storage.Appender()
+		ts += timeDelta
+
+		for _, s := range scrape {
+			s.value += 1000
+
+			// Fast path: the series reference is already known.
+			if s.ref != nil {
+				err := app.AddFast(*s.ref, ts, float64(s.value))
+				if err == nil {
+					total++
+					continue
+				}
+				if errors.Cause(err) != tsdb.ErrNotFound {
+					panic(err)
+				}
+				// The reference is no longer known; fall through to a full add.
+			}
+
+			ref, err := app.Add(s.labels, ts, float64(s.value))
+			if err != nil {
+				panic(err)
+			}
+			s.ref = &ref
+			total++
+		}
+
+		if err := app.Commit(); err != nil {
+			return total, err
+		}
+	}
+	return total, nil
+}
+
+// startProfiling creates cpu.prof, mem.prof, block.prof and mutex.prof in the
+// benchmark output directory and switches the matching profilers on.
+//
+// CPU profiling streams into its file right away. For the heap, block and mutex
+// profiles only the sampling rate is set here and the file handle is stored on
+// b. It returns on the first failure without undoing steps that already
+// succeeded.
+func (b *writeBenchmark) startProfiling() error {
+	// create opens a fresh file with the given name inside the output directory.
+	create := func(name string) (*os.File, error) {
+		return os.Create(filepath.Join(b.outPath, name))
+	}
+
+	var err error
+
+	// CPU profile.
+	if b.cpuprof, err = create("cpu.prof"); err != nil {
+		return fmt.Errorf("bench: could not create cpu profile: %v", err)
+	}
+	if err = pprof.StartCPUProfile(b.cpuprof); err != nil {
+		return fmt.Errorf("bench: could not start CPU profile: %v", err)
+	}
+
+	// Heap profile.
+	if b.memprof, err = create("mem.prof"); err != nil {
+		return fmt.Errorf("bench: could not create memory profile: %v", err)
+	}
+	runtime.MemProfileRate = 64 * 1024
+
+	// Block profile.
+	if b.blockprof, err = create("block.prof"); err != nil {
+		return fmt.Errorf("bench: could not create block profile: %v", err)
+	}
+	runtime.SetBlockProfileRate(20)
+
+	// Mutex profile.
+	if b.mtxprof, err = create("mutex.prof"); err != nil {
+		return fmt.Errorf("bench: could not create mutex profile: %v", err)
+	}
+	runtime.SetMutexProfileFraction(20)
+
+	return nil
+}
+
+func (b *writeBenchmark) stopProfiling() error {
+	if b.cpuprof != nil {
+		pprof.StopCPUProfile()
+		b.cpuprof.Close()
+		b.cpuprof = nil
+	}
+	if b.memprof != nil {
+		if err := pprof.Lookup("heap").WriteTo(b.memprof, 0); err != nil {
+			return fmt.Errorf("error writing mem profile: %v", err)
+		}
+		b.memprof.Close()
+		b.memprof = nil
+	}
+	if b.blockprof != nil {
+		if err := pprof.Lookup("block").WriteTo(b.blockprof, 0); err != nil {
+			return fmt.Errorf("error writing block profile: %v", err)
+		}
+		b.blockprof.Close()
+		b.blockprof = nil
+		runtime.SetBlockProfileRate(0)
+	}
+	if b.mtxprof != nil {
+		if err := pprof.Lookup("mutex").WriteTo(b.mtxprof, 0); err != nil {
+			return fmt.Errorf("error writing mutex profile: %v", err)
+		}
+		b.mtxprof.Close()
+		b.mtxprof = nil
+		runtime.SetMutexProfileFraction(0)
+	}
+	return nil
+}
+
+// measureTime runs f as the named stage, printing a start line before it and a
+// completion line with the elapsed time after it.
+//
+// It returns the wall-clock time f took. If f fails, its error is returned
+// unchanged with a zero duration and no completion line is printed.
+func measureTime(stage string, f func() error) (time.Duration, error) {
+	fmt.Printf(">> start stage=%s\n", stage)
+	start := time.Now()
+	if err := f(); err != nil {
+		return 0, err
+	}
+	// Take a single reading so the logged and the returned duration agree and
+	// the time spent printing is not charged to the stage.
+	elapsed := time.Since(start)
+	fmt.Printf(">> completed stage=%s duration=%s\n", stage, elapsed)
+	return elapsed, nil
+}
+
+// readPrometheusLabels parses up to n distinct label sets from r.
+//
+// Every non-empty line holds one label set written as comma separated
+// name:value pairs; double quotes and curly braces are stripped before
+// parsing. Each set is sorted with sort.Sort, and sets whose hash was already
+// seen are skipped without counting towards n.
+//
+// An error is returned if a pair has no ':' separator or if reading from r
+// fails.
+func readPrometheusLabels(r io.Reader, n int) ([]labels.Labels, error) {
+	scanner := bufio.NewScanner(r)
+	// Build the replacer once instead of once per line.
+	stripper := strings.NewReplacer("\"", "", "{", "", "}", "")
+
+	var mets []labels.Labels
+	hashes := map[uint64]struct{}{}
+
+	// Test the limit first so no line beyond the n-th set is consumed.
+	for len(mets) < n && scanner.Scan() {
+		s := stripper.Replace(scanner.Text())
+		if s == "" {
+			continue
+		}
+
+		m := make(labels.Labels, 0, 10)
+		for _, labelChunk := range strings.Split(s, ",") {
+			// Cut at the first ':' only; values such as host:port contain more.
+			split := strings.SplitN(labelChunk, ":", 2)
+			if len(split) != 2 {
+				return nil, fmt.Errorf("invalid label pair %q in line %q", labelChunk, scanner.Text())
+			}
+			m = append(m, labels.Label{Name: split[0], Value: split[1]})
+		}
+		// Order of the k/v labels matters, don't assume we'll always receive them already sorted.
+		sort.Sort(m)
+		h := m.Hash()
+		if _, ok := hashes[h]; ok {
+			continue
+		}
+		mets = append(mets, m)
+		hashes[h] = struct{}{}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return mets, nil
+}
+
+func printBlocks(blocks []tsdb.BlockReader, humanReadable *bool) {
+	tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
+	defer tw.Flush()
+
+	fmt.Fprintln(tw, "BLOCK ULID\tMIN TIME\tMAX TIME\tNUM SAMPLES\tNUM CHUNKS\tNUM SERIES")
+	for _, b := range blocks {
+		meta := b.Meta()
+
+		fmt.Fprintf(tw,
+			"%v\t%v\t%v\t%v\t%v\t%v\n",
+			meta.ULID,
+			getFormatedTime(meta.MinTime, humanReadable),
+			getFormatedTime(meta.MaxTime, humanReadable),
+			meta.Stats.NumSamples,
+			meta.Stats.NumChunks,
+			meta.Stats.NumSeries,
+		)
+	}
+}
+
+// getFormatedTime renders timestamp as text. With *humanReadable unset it is
+// the plain decimal number; otherwise the timestamp is divided by 1000 (the
+// remainder is dropped), taken as Unix seconds and printed with
+// time.Time.String. humanReadable must not be nil.
+func getFormatedTime(timestamp int64, humanReadable *bool) string {
+	if !*humanReadable {
+		return strconv.FormatInt(timestamp, 10)
+	}
+	seconds := timestamp / 1000
+	return time.Unix(seconds, 0).String()
+}
+
+// analyzeBlock prints statistics about block b to stdout: its ID, duration,
+// series and label-name counts, posting counts, and then several rankings
+// (label pairs and label names most involved in churning, most common label
+// pairs, label names with the highest cumulative value length, highest
+// cardinality labels and highest cardinality metric names).
+//
+// limit is the maximum number of rows printed per ranking. The first error
+// from the block's index reader aborts the analysis and is returned.
+func analyzeBlock(b tsdb.BlockReader, limit int) error {
+	meta := b.Meta()
+	fmt.Printf("Block ID: %s\n", meta.ULID)
+	// Presume 1ms resolution that Prometheus uses.
+	fmt.Printf("Duration: %s\n", (time.Duration(meta.MaxTime-meta.MinTime) * 1e6).String())
+	fmt.Printf("Series: %d\n", meta.Stats.NumSeries)
+	ir, err := b.Index()
+	if err != nil {
+		return err
+	}
+	defer ir.Close()
+
+	allLabelNames, err := ir.LabelNames()
+	if err != nil {
+		return err
+	}
+	fmt.Printf("Label names: %d\n", len(allLabelNames))
+
+	type postingInfo struct {
+		key    string
+		metric uint64
+	}
+	postingInfos := []postingInfo{}
+
+	// printInfo sorts the entries by metric, largest first, and prints at most
+	// limit of them.
+	printInfo := func(postingInfos []postingInfo) {
+		sort.Slice(postingInfos, func(i, j int) bool { return postingInfos[i].metric > postingInfos[j].metric })
+
+		for i, pc := range postingInfos {
+			// Check before printing so exactly limit rows are shown, not limit+1.
+			if i >= limit {
+				break
+			}
+			fmt.Printf("%d %s\n", pc.metric, pc.key)
+		}
+	}
+
+	labelsUncovered := map[string]uint64{}
+	labelpairsUncovered := map[string]uint64{}
+	labelpairsCount := map[string]uint64{}
+	entries := 0
+	p, err := ir.Postings("", "") // The special all key.
+	if err != nil {
+		return err
+	}
+	lbls := labels.Labels{}
+	chks := []chunks.Meta{}
+	for p.Next() {
+		if err = ir.Series(p.At(), &lbls, &chks); err != nil {
+			return err
+		}
+		// Amount of the block time range not covered by this series.
+		// A series without chunks covers nothing; guard the index access.
+		uncovered := uint64(meta.MaxTime - meta.MinTime)
+		if len(chks) > 0 {
+			uncovered -= uint64(chks[len(chks)-1].MaxTime - chks[0].MinTime)
+		}
+		for _, lbl := range lbls {
+			key := lbl.Name + "=" + lbl.Value
+			labelsUncovered[lbl.Name] += uncovered
+			labelpairsUncovered[key] += uncovered
+			labelpairsCount[key]++
+			entries++
+		}
+	}
+	if p.Err() != nil {
+		return p.Err()
+	}
+	fmt.Printf("Postings (unique label pairs): %d\n", len(labelpairsUncovered))
+	fmt.Printf("Postings entries (total label pairs): %d\n", entries)
+
+	postingInfos = postingInfos[:0]
+	for k, m := range labelpairsUncovered {
+		postingInfos = append(postingInfos, postingInfo{k, uint64(float64(m) / float64(meta.MaxTime-meta.MinTime))})
+	}
+
+	fmt.Printf("\nLabel pairs most involved in churning:\n")
+	printInfo(postingInfos)
+
+	postingInfos = postingInfos[:0]
+	for k, m := range labelsUncovered {
+		postingInfos = append(postingInfos, postingInfo{k, uint64(float64(m) / float64(meta.MaxTime-meta.MinTime))})
+	}
+
+	fmt.Printf("\nLabel names most involved in churning:\n")
+	printInfo(postingInfos)
+
+	postingInfos = postingInfos[:0]
+	for k, m := range labelpairsCount {
+		postingInfos = append(postingInfos, postingInfo{k, m})
+	}
+
+	fmt.Printf("\nMost common label pairs:\n")
+	printInfo(postingInfos)
+
+	postingInfos = postingInfos[:0]
+	for _, n := range allLabelNames {
+		values, err := ir.LabelValues(n)
+		if err != nil {
+			return err
+		}
+		var cumulativeLength uint64
+
+		for i := 0; i < values.Len(); i++ {
+			// Bind the error of this call; it used to be discarded while a
+			// stale err was checked instead.
+			value, err := values.At(i)
+			if err != nil {
+				return err
+			}
+			for _, str := range value {
+				cumulativeLength += uint64(len(str))
+			}
+		}
+
+		postingInfos = append(postingInfos, postingInfo{n, cumulativeLength})
+	}
+
+	fmt.Printf("\nLabel names with highest cumulative label value length:\n")
+	printInfo(postingInfos)
+
+	postingInfos = postingInfos[:0]
+	for _, n := range allLabelNames {
+		lv, err := ir.LabelValues(n)
+		if err != nil {
+			return err
+		}
+		postingInfos = append(postingInfos, postingInfo{n, uint64(lv.Len())})
+	}
+	fmt.Printf("\nHighest cardinality labels:\n")
+	printInfo(postingInfos)
+
+	postingInfos = postingInfos[:0]
+	lv, err := ir.LabelValues("__name__")
+	if err != nil {
+		return err
+	}
+	for i := 0; i < lv.Len(); i++ {
+		names, err := lv.At(i)
+		if err != nil {
+			return err
+		}
+		for _, n := range names {
+			postings, err := ir.Postings("__name__", n)
+			if err != nil {
+				return err
+			}
+			// Count the series that carry this metric name.
+			count := 0
+			for postings.Next() {
+				count++
+			}
+			if postings.Err() != nil {
+				return postings.Err()
+			}
+			postingInfos = append(postingInfos, postingInfo{n, uint64(count)})
+		}
+	}
+	fmt.Printf("\nHighest cardinality metric names:\n")
+	printInfo(postingInfos)
+	return nil
+}
+
+// dumpSamples prints the samples returned by a querier on db for the range
+// mint..maxt to stdout, one "<labels> <value> <timestamp>" line per sample.
+//
+// It returns the first error from opening the querier, selecting the series or
+// iterating over series or samples, merged with any error from closing the
+// querier.
+func dumpSamples(db *tsdb.DBReadOnly, mint, maxt int64) (err error) {
+
+	q, err := db.Querier(mint, maxt)
+	if err != nil {
+		return err
+	}
+	// Fold the querier's close error into whatever error is being returned.
+	defer func() {
+		var merr tsdb_errors.MultiError
+		merr.Add(err)
+		merr.Add(q.Close())
+		err = merr.Err()
+	}()
+
+	ss, err := q.Select(labels.NewMustRegexpMatcher("", ".*"))
+	if err != nil {
+		return err
+	}
+
+	for ss.Next() {
+		series := ss.At()
+		// Named lset so the imported labels package is not shadowed.
+		lset := series.Labels()
+		it := series.Iterator()
+		for it.Next() {
+			ts, val := it.At()
+			fmt.Printf("%s %g %d\n", lset, val, ts)
+		}
+		// Report the sample iterator's own error; returning ss.Err() here
+		// would hide the failure.
+		if it.Err() != nil {
+			return it.Err()
+		}
+	}
+
+	if ss.Err() != nil {
+		return ss.Err()
+	}
+	return nil
+}
--- a/tsdb/compact.go
+++ b/tsdb/compact.go
--- a/tsdb/compact_test.go
+++ b/tsdb/compact_test.go
--- a/tsdb/db.go
+++ b/tsdb/db.go
--- a/tsdb/db_test.go
+++ b/tsdb/db_test.go
--- a/tsdb/docs/format/README.md
+++ b/tsdb/docs/format/README.md
@ -0,0 +1,6 @@
+## TSDB format
+
+* [Index](index.md)
+* [Chunks](chunks.md)
+* [Tombstones](tombstones.md)
+* [Wal](wal.md)
--- a/tsdb/docs/format/chunks.md
+++ b/tsdb/docs/format/chunks.md
@ -0,0 +1,31 @@
+# Chunks Disk Format
+
+The following describes the format of a chunks file,
+which is created in the `chunks/` directory of a block.
+The maximum size per segment file is 512MiB.
+
+Chunks in the files are referenced from the index by uint64 composed of
+in-file offset (lower 4 bytes) and segment sequence number (upper 4 bytes).
+
+```
+┌────────────────────────────┬─────────────────────┐
+│ magic(0x85BD40DD) <4 byte> │ version(1) <1 byte> │
+├────────────────────────────┴─────────────────────┤
+│ ┌──────────────────────────────────────────────┐ │
+│ │                   Chunk 1                    │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                     ...                      │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                   Chunk N                    │ │
+│ └──────────────────────────────────────────────┘ │
+└──────────────────────────────────────────────────┘
+```
+
+
+# Chunk
+
+```
+┌───────────────┬───────────────────┬──────────────┬────────────────┐
+│ len <uvarint> │ encoding <1 byte> │ data <bytes> │ CRC32 <4 byte> │
+└───────────────┴───────────────────┴──────────────┴────────────────┘
+```
--- a/tsdb/docs/format/index.md
+++ b/tsdb/docs/format/index.md
@ -0,0 +1,251 @@
+# Index Disk Format
+
+The following describes the format of the `index` file found in each block directory.
+It is terminated by a table of contents which serves as an entry point into the index.
+
+```
+┌────────────────────────────┬─────────────────────┐
+│ magic(0xBAAAD700) <4b>     │ version(1) <1 byte> │
+├────────────────────────────┴─────────────────────┤
+│ ┌──────────────────────────────────────────────┐ │
+│ │                 Symbol Table                 │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                    Series                    │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                 Label Index 1                │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                      ...                     │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                 Label Index N                │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                   Postings 1                 │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                      ...                     │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                   Postings N                 │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │               Label Index Table              │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                 Postings Table               │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                      TOC                     │ │
+│ └──────────────────────────────────────────────┘ │
+└──────────────────────────────────────────────────┘
+```
+
+When the index is written, an arbitrary number of padding bytes may be added between the lined out main sections above. When sequentially scanning through the file, any zero bytes after a section's specified length must be skipped.
+
+Most of the sections described below start with a `len` field. It always specifies the number of bytes just before the trailing CRC32 checksum. The checksum is always calculated over those `len` bytes.
+
+
+### Symbol Table
+
+The symbol table holds a sorted list of deduplicated strings that occurred in label pairs of the stored series. They can be referenced from subsequent sections and significantly reduce the total index size.
+
+The section contains a sequence of the string entries, each prefixed with the string's length in raw bytes. All strings are utf-8 encoded.
+Strings are referenced by sequential indexing. The strings are sorted in lexicographically ascending order.
+
+```
+┌────────────────────┬─────────────────────┐
+│ len <4b>           │ #symbols <4b>       │
+├────────────────────┴─────────────────────┤
+│ ┌──────────────────────┬───────────────┐ │
+│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
+│ ├──────────────────────┴───────────────┤ │
+│ │                . . .                 │ │
+│ ├──────────────────────┬───────────────┤ │
+│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
+│ └──────────────────────┴───────────────┘ │
+├──────────────────────────────────────────┤
+│ CRC32 <4b>                               │
+└──────────────────────────────────────────┘
+```
+
+
+### Series
+
+The section contains a sequence of series that hold the label set of the series as well as its chunks within the block. The series are sorted lexicographically by their label sets.  
+Each series section is aligned to 16 bytes. The ID for a series is the `offset/16`. This serves as the series' ID in all subsequent references. Thereby, a sorted list of series IDs implies a lexicographically sorted list of series label sets. 
+
+```
+┌───────────────────────────────────────┐
+│ ┌───────────────────────────────────┐ │
+│ │   series_1                        │ │
+│ ├───────────────────────────────────┤ │
+│ │                 . . .             │ │
+│ ├───────────────────────────────────┤ │
+│ │   series_n                        │ │
+│ └───────────────────────────────────┘ │
+└───────────────────────────────────────┘
+```
+
+Every series entry first holds its number of labels, followed by tuples of symbol table references that contain the label name and value. The label pairs are lexicographically sorted.  
+After the labels, the number of indexed chunks is encoded, followed by a sequence of metadata entries containing the chunk's minimum (`mint`) and maximum (`maxt`) timestamp and a reference to its position in the chunk file. The `mint` is the time of the first sample and `maxt` is the time of the last sample in the chunk. Holding the time range data in the index allows dropping chunks irrelevant to queried time ranges without accessing them directly.
+
+The `mint` of the first chunk is stored as is, and its `maxt` is stored as a delta to that `mint`. For subsequent chunks, `mint` is encoded as a delta to the previous chunk's `maxt` and `maxt` as a delta to the chunk's own `mint`. Similarly, the reference of the first chunk is stored as is and each following reference is stored as a delta to the previous one.
+
+```
+┌──────────────────────────────────────────────────────────────────────────┐
+│ len <uvarint>                                                            │
+├──────────────────────────────────────────────────────────────────────────┤
+│ ┌──────────────────────────────────────────────────────────────────────┐ │
+│ │                     labels count <uvarint64>                         │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ ref(l_i.name) <uvarint32>                  │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ ref(l_i.value) <uvarint32>                 │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ │                             ...                                      │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │                     chunks count <uvarint64>                         │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ c_0.mint <varint64>                        │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ c_0.maxt - c_0.mint <uvarint64>            │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ ref(c_0.data) <uvarint64>                  │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ c_i.mint - c_i-1.maxt <uvarint64>          │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ c_i.maxt - c_i.mint <uvarint64>            │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ ref(c_i.data) - ref(c_i-1.data) <varint64> │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ │                             ...                                      │ │
+│ └──────────────────────────────────────────────────────────────────────┘ │
+├──────────────────────────────────────────────────────────────────────────┤
+│ CRC32 <4b>                                                               │
+└──────────────────────────────────────────────────────────────────────────┘
+```
+
+
+
+### Label Index
+
+A label index section indexes the existing (combined) values for one or more label names.
+The `#names` field determines the number of indexed label names, followed by the total number of entries in the `#entries` field. The body holds #entries / #names tuples of symbol table references, each tuple being of #names length. The value tuples are sorted in lexicographically increasing order.
+
+```
+┌───────────────┬────────────────┬────────────────┐
+│ len <4b>      │ #names <4b>    │ #entries <4b>  │
+├───────────────┴────────────────┴────────────────┤
+│ ┌─────────────────────────────────────────────┐ │
+│ │ ref(value_0) <4b>                           │ │
+│ ├─────────────────────────────────────────────┤ │
+│ │ ...                                         │ │
+│ ├─────────────────────────────────────────────┤ │
+│ │ ref(value_n) <4b>                           │ │
+│ └─────────────────────────────────────────────┘ │
+│                      . . .                      │
+├─────────────────────────────────────────────────┤
+│ CRC32 <4b>                                      │
+└─────────────────────────────────────────────────┘
+```
+
+For instance, a single label name with 4 different values will be encoded as:
+
+```
+┌────┬───┬───┬──────────────┬──────────────┬──────────────┬──────────────┬───────┐
+│ 24 │ 1 │ 4 │ ref(value_0) │ ref(value_1) │ ref(value_2) │ ref(value_3) │ CRC32 │
+└────┴───┴───┴──────────────┴──────────────┴──────────────┴──────────────┴───────┘
+```
+
+The sequence of label index sections is finalized by a [label offset table](#label-offset-table) containing label offset entries that point to the beginning of each label index section for a given label name.
+
+### Postings
+
+Postings sections store monotonically increasing lists of series references that contain a given label pair associated with the list.
+
+```
+┌────────────────────┬────────────────────┐
+│ len <4b>           │ #entries <4b>      │
+├────────────────────┴────────────────────┤
+│ ┌─────────────────────────────────────┐ │
+│ │ ref(series_1) <4b>                  │ │
+│ ├─────────────────────────────────────┤ │
+│ │ ...                                 │ │
+│ ├─────────────────────────────────────┤ │
+│ │ ref(series_n) <4b>                  │ │
+│ └─────────────────────────────────────┘ │
+├─────────────────────────────────────────┤
+│ CRC32 <4b>                              │
+└─────────────────────────────────────────┘
+```
+
+The sequence of postings sections is finalized by a [postings offset table](#postings-offset-table) containing postings offset entries that point to the beginning of each postings section for a given label pair.
+
+### Label Offset Table
+
+A label offset table stores a sequence of label offset entries.
+Every label offset entry holds the label name and the offset to its values in the label index section.
+They are used to track label index sections. They are read into memory when an index file is loaded.
+
+```
+┌─────────────────────┬──────────────────────┐
+│ len <4b>            │ #entries <4b>        │
+├─────────────────────┴──────────────────────┤
+│ ┌────────────────────────────────────────┐ │
+│ │  n = 1 <1b>                            │ │
+│ ├──────────────────────┬─────────────────┤ │
+│ │ len(name) <uvarint>  │ name <bytes>    │ │
+│ ├──────────────────────┴─────────────────┤ │
+│ │  offset <uvarint64>                    │ │
+│ └────────────────────────────────────────┘ │
+│                    . . .                   │
+├────────────────────────────────────────────┤
+│  CRC32 <4b>                                │
+└────────────────────────────────────────────┘
+```
+
+
+### Postings Offset Table
+
+A postings offset table stores a sequence of postings offset entries.
+Every postings offset entry holds the label name/value pair and the offset to its series list in the postings section.
+They are used to track postings sections. They are read into memory when an index file is loaded.
+
+```
+┌─────────────────────┬──────────────────────┐
+│ len <4b>            │ #entries <4b>        │
+├─────────────────────┴──────────────────────┤
+│ ┌────────────────────────────────────────┐ │
+│ │  n = 2 <1b>                            │ │
+│ ├──────────────────────┬─────────────────┤ │
+│ │ len(name) <uvarint>  │ name <bytes>    │ │
+│ ├──────────────────────┼─────────────────┤ │
+│ │ len(value) <uvarint> │ value <bytes>   │ │
+│ ├──────────────────────┴─────────────────┤ │
+│ │  offset <uvarint64>                    │ │
+│ └────────────────────────────────────────┘ │
+│                    . . .                   │
+├────────────────────────────────────────────┤
+│  CRC32 <4b>                                │
+└────────────────────────────────────────────┘
+```
+
+
+### TOC
+
+The table of contents serves as an entry point to the entire index and points to various sections in the file.
+If a reference is zero, it indicates the respective section does not exist and empty results should be returned upon lookup.
+
+```
+┌─────────────────────────────────────────┐
+│ ref(symbols) <8b>                       │
+├─────────────────────────────────────────┤
+│ ref(series) <8b>                        │
+├─────────────────────────────────────────┤
+│ ref(label indices start) <8b>           │
+├─────────────────────────────────────────┤
+│ ref(label offset table) <8b>            │
+├─────────────────────────────────────────┤
+│ ref(postings start) <8b>                │
+├─────────────────────────────────────────┤
+│ ref(postings offset table) <8b>         │
+├─────────────────────────────────────────┤
+│ CRC32 <4b>                              │
+└─────────────────────────────────────────┘
+```
--- a/tsdb/docs/format/tombstones.md
+++ b/tsdb/docs/format/tombstones.md
@ -0,0 +1,31 @@
+# Tombstones Disk Format
+
+The following describes the format of a tombstones file, which is placed
+at the top level directory of a block.
+
+The last 8 bytes specify the offset to the start of the stones section.
+The stones section is 0 padded to a multiple of 4 for fast scans.
+
+```
+┌────────────────────────────┬─────────────────────┐
+│ magic(0x0130BA30) <4b>     │ version(1) <1 byte> │
+├────────────────────────────┴─────────────────────┤
+│ ┌──────────────────────────────────────────────┐ │
+│ │                Tombstone 1                   │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                      ...                     │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                Tombstone N                   │ │
+│ ├──────────────────────────────────────────────┤ │
+│ │                  CRC<4b>                     │ │
+│ └──────────────────────────────────────────────┘ │
+└──────────────────────────────────────────────────┘
+```
+
+# Tombstone 
+
+```
+┌────────────────┬─────────────────┬────────────────┐
+│ref <uvarint64> │ mint <varint64> │ maxt <varint64>│
+└────────────────┴─────────────────┴────────────────┘
+```
--- a/tsdb/docs/format/wal.md
+++ b/tsdb/docs/format/wal.md
@ -0,0 +1,88 @@
+# WAL Disk Format
+
+The write ahead log operates in segments that are numbered and sequential,
+e.g. `000000`, `000001`, `000002`, etc., and are limited to 128MB by default.
+A segment is written to in pages of 32KB. Only the last page of the most recent segment
+may be partial. A WAL record is an opaque byte slice that gets split up into sub-records
+should it exceed the remaining space of the current page. Records are never split across
+segment boundaries. If a single record exceeds the default segment size, a segment with
+a larger size will be created.
+The encoding of pages is largely borrowed from [LevelDB's/RocksDB's write ahead log.](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format)
+
+Notable deviations are that the record fragment is encoded as:
+
+```
+┌───────────┬──────────┬────────────┬──────────────┐
+│ type <1b> │ len <2b> │ CRC32 <4b> │ data <bytes> │
+└───────────┴──────────┴────────────┴──────────────┘
+```
+
+The type flag has the following states:
+
+* `0`: rest of page will be empty
+* `1`: a full record encoded in a single fragment
+* `2`: first fragment of a record
+* `3`: middle fragment of a record
+* `4`: final fragment of a record
+
+## Record encoding
+
+The records written to the write ahead log are encoded as follows:
+
+### Series records
+
+Series records encode the labels that identify a series and its unique ID.
+
+```
+┌────────────────────────────────────────────┐
+│ type = 1 <1b>                              │
+├────────────────────────────────────────────┤
+│ ┌─────────┬──────────────────────────────┐ │
+│ │ id <8b> │ n = len(labels) <uvarint>    │ │
+│ ├─────────┴────────────┬─────────────────┤ │
+│ │ len(str_1) <uvarint> │ str_1 <bytes>   │ │
+│ ├──────────────────────┴─────────────────┤ │
+│ │  ...                                   │ │
+│ ├───────────────────────┬────────────────┤ │
+│ │ len(str_2n) <uvarint> │ str_2n <bytes> │ │
+│ └───────────────────────┴────────────────┘ │
+│                  . . .                     │
+└────────────────────────────────────────────┘
+```
+
+### Sample records
+
+Sample records encode samples as a list of triples `(series_id, timestamp, value)`.
+Series reference and timestamp are encoded as deltas w.r.t the first sample.
+The first row stores the starting id and the starting timestamp.
+The first sample record begins at the second row.
+
+```
+┌──────────────────────────────────────────────────────────────────┐
+│ type = 2 <1b>                                                    │
+├──────────────────────────────────────────────────────────────────┤
+│ ┌────────────────────┬───────────────────────────┐               │
+│ │ id <8b>            │ timestamp <8b>            │               │
+│ └────────────────────┴───────────────────────────┘               │
+│ ┌────────────────────┬───────────────────────────┬─────────────┐ │
+│ │ id_delta <uvarint> │ timestamp_delta <uvarint> │ value <8b>  │ │
+│ └────────────────────┴───────────────────────────┴─────────────┘ │
+│                              . . .                               │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+### Tombstone records
+
+Tombstone records encode tombstones as a list of triples `(series_id, min_time, max_time)`
+and specify an interval for which samples of a series got deleted.
+
+```
+┌─────────────────────────────────────────────────────┐
+│ type = 3 <1b>                                       │
+├─────────────────────────────────────────────────────┤
+│ ┌─────────┬───────────────────┬───────────────────┐ │
+│ │ id <8b> │ min_time <varint> │ max_time <varint> │ │
+│ └─────────┴───────────────────┴───────────────────┘ │
+│                        . . .                        │
+└─────────────────────────────────────────────────────┘
+```
--- a/tsdb/encoding/encoding.go
+++ b/tsdb/encoding/encoding.go
@ -0,0 +1,244 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding
+
+import (
+	"encoding/binary"
+	"hash"
+	"hash/crc32"
+	"unsafe"
+
+	"github.com/pkg/errors"
+)
+
+// Sentinel errors produced by the decoding helpers in this package.
+// ErrInvalidSize is reported when the input holds fewer bytes than the value
+// being read requires; ErrInvalidChecksum is reported when a stored CRC32 does
+// not match the checksum computed over the contents.
+var (
+	ErrInvalidSize     = errors.New("invalid size")
+	ErrInvalidChecksum = errors.New("invalid checksum")
+)
+
+// Encbuf is a helper type to populate a byte slice with various types.
+// B holds the bytes written so far. C is scratch space that the Put* methods
+// encode fixed-width and varint integers into before appending them to B.
+type Encbuf struct {
+	B []byte
+	C [binary.MaxVarintLen64]byte
+}
+
+// Reset truncates the buffer to zero length; the backing array is kept.
+func (e *Encbuf) Reset() {
+	e.B = e.B[:0]
+}
+
+// Get returns the bytes written so far.
+func (e *Encbuf) Get() []byte {
+	return e.B
+}
+
+// Len returns the number of bytes written so far.
+func (e *Encbuf) Len() int {
+	return len(e.B)
+}
+
+// PutString appends the raw bytes of s without any length prefix.
+func (e *Encbuf) PutString(s string) {
+	e.B = append(e.B, s...)
+}
+
+// PutByte appends the single byte c.
+func (e *Encbuf) PutByte(c byte) {
+	e.B = append(e.B, c)
+}
+
+// PutBE32int converts x to uint32 and appends it as 4 big-endian bytes.
+func (e *Encbuf) PutBE32int(x int) {
+	v := uint32(x)
+	e.PutBE32(v)
+}
+
+// PutUvarint32 appends x as a uvarint.
+func (e *Encbuf) PutUvarint32(x uint32) {
+	v := uint64(x)
+	e.PutUvarint64(v)
+}
+
+// PutBE64int64 converts x to uint64 and appends it as 8 big-endian bytes.
+func (e *Encbuf) PutBE64int64(x int64) {
+	v := uint64(x)
+	e.PutBE64(v)
+}
+
+// PutUvarint converts x to uint64 and appends it as a uvarint.
+func (e *Encbuf) PutUvarint(x int) {
+	v := uint64(x)
+	e.PutUvarint64(v)
+}
+
+// PutBE32 appends x as 4 big-endian bytes, staging them in the scratch array C.
+func (e *Encbuf) PutBE32(x uint32) {
+	scratch := e.C[:]
+	binary.BigEndian.PutUint32(scratch, x)
+	e.B = append(e.B, scratch[:4]...)
+}
+
+// PutBE64 appends x as 8 big-endian bytes, staging them in the scratch array C.
+func (e *Encbuf) PutBE64(x uint64) {
+	scratch := e.C[:]
+	binary.BigEndian.PutUint64(scratch, x)
+	e.B = append(e.B, scratch[:8]...)
+}
+
+// PutUvarint64 appends x in unsigned varint encoding.
+func (e *Encbuf) PutUvarint64(x uint64) {
+	scratch := e.C[:]
+	width := binary.PutUvarint(scratch, x)
+	e.B = append(e.B, scratch[:width]...)
+}
+
+// PutVarint64 appends x in signed varint encoding.
+func (e *Encbuf) PutVarint64(x int64) {
+	scratch := e.C[:]
+	width := binary.PutVarint(scratch, x)
+	e.B = append(e.B, scratch[:width]...)
+}
+
+// PutUvarintStr writes a string to the buffer prefixed by its uvarint-encoded
+// length (in bytes, not runes).
+//
+// NOTE(review): the unsafe cast below is only used to take the length;
+// len(s) would presumably give the same value without reinterpreting the
+// string header as a slice header. Confirm and simplify together with the
+// file's "unsafe" import.
+func (e *Encbuf) PutUvarintStr(s string) {
+	b := *(*[]byte)(unsafe.Pointer(&s))
+	e.PutUvarint(len(b))
+	e.PutString(s)
+}
+
+// PutHash resets h, feeds it the buffer's current contents and appends the
+// resulting sum to the buffer. It panics if h.Write reports an error.
+func (e *Encbuf) PutHash(h hash.Hash) {
+	h.Reset()
+	if _, err := h.Write(e.B); err != nil {
+		// The CRC32 implementation does not error
+		panic(err)
+	}
+	e.B = h.Sum(e.B)
+}
+
+// Decbuf provides safe methods to extract data from a byte slice. It does all
+// necessary bounds checking and advancing of the byte slice.
+// Several datums can be extracted without checking for errors. However, before using
+// any datum, the Err() method must be checked.
+// B holds the bytes not yet consumed; E holds the first error encountered,
+// after which the read methods return zero values without consuming input.
+type Decbuf struct {
+	B []byte
+	E error
+}
+
+// NewDecbufAt builds a decoding buffer for the section starting at off in bs.
+// The section layout is: a 4-byte big-endian content length, the contents,
+// and a 4-byte big-endian CRC32 of the contents. The returned Decbuf carries
+// ErrInvalidSize if bs is too short, or ErrInvalidChecksum if the stored
+// checksum does not match the one computed with castagnoliTable.
+func NewDecbufAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
+	contentStart := off + 4
+	if bs.Len() < contentStart {
+		return Decbuf{E: ErrInvalidSize}
+	}
+	contentLen := int(binary.BigEndian.Uint32(bs.Range(off, contentStart)))
+
+	sectionEnd := contentStart + contentLen + 4
+	if bs.Len() < sectionEnd {
+		return Decbuf{E: ErrInvalidSize}
+	}
+
+	// Contents followed by the trailing CRC32 checksum.
+	raw := bs.Range(contentStart, sectionEnd)
+	split := len(raw) - 4
+	dec := Decbuf{B: raw[:split]}
+
+	stored := binary.BigEndian.Uint32(raw[split:])
+	if dec.Crc32(castagnoliTable) != stored {
+		return Decbuf{E: ErrInvalidChecksum}
+	}
+	return dec
+}
+
+// NewDecbufUvarintAt builds a decoding buffer for the section starting at off
+// in bs. The section layout is: a uvarint-encoded content length, the
+// contents, and a 4-byte big-endian CRC32 of the contents. The returned Decbuf
+// carries an error if bs is too short, the length prefix is malformed, or the
+// checksum computed with castagnoliTable does not match the stored one.
+func NewDecbufUvarintAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Decbuf {
+	// We never have to access this method at the far end of the byte slice. Thus just checking
+	// against the MaxVarintLen32 is sufficient.
+	prefixEnd := off + binary.MaxVarintLen32
+	if bs.Len() < prefixEnd {
+		return Decbuf{E: ErrInvalidSize}
+	}
+
+	size, width := binary.Uvarint(bs.Range(off, prefixEnd))
+	if width <= 0 || width > binary.MaxVarintLen32 {
+		return Decbuf{E: errors.Errorf("invalid uvarint %d", width)}
+	}
+
+	contentStart := off + width
+	sectionEnd := contentStart + int(size) + 4
+	if bs.Len() < sectionEnd {
+		return Decbuf{E: ErrInvalidSize}
+	}
+
+	// Contents followed by the trailing CRC32 checksum.
+	raw := bs.Range(contentStart, sectionEnd)
+	split := len(raw) - 4
+	dec := Decbuf{B: raw[:split]}
+
+	if stored := binary.BigEndian.Uint32(raw[split:]); dec.Crc32(castagnoliTable) != stored {
+		return Decbuf{E: ErrInvalidChecksum}
+	}
+	return dec
+}
+
+// Uvarint reads a uvarint and returns it converted to int.
+func (d *Decbuf) Uvarint() int {
+	v := d.Uvarint64()
+	return int(v)
+}
+
+// Be32int reads 4 big-endian bytes and returns them converted to int.
+func (d *Decbuf) Be32int() int {
+	v := d.Be32()
+	return int(v)
+}
+
+// Be64int64 reads 8 big-endian bytes and returns them converted to int64.
+func (d *Decbuf) Be64int64() int64 {
+	v := d.Be64()
+	return int64(v)
+}
+
+// Crc32 computes the CRC32 of the bytes not yet consumed, using the given
+// polynomial table. The buffer is not advanced.
+func (d *Decbuf) Crc32(castagnoliTable *crc32.Table) uint32 {
+	return crc32.Update(0, castagnoliTable, d.B)
+}
+
+// UvarintStr reads a uvarint length prefix followed by that many bytes and
+// returns those bytes as a string, advancing the buffer past them. If an
+// error is already set or the prefix cannot be read, it returns "". If fewer
+// bytes remain than the prefix announces, it sets ErrInvalidSize and
+// returns "".
+func (d *Decbuf) UvarintStr() string {
+	l := d.Uvarint64()
+	if d.E != nil {
+		return ""
+	}
+	// Compare in uint64 space: converting a huge length to int can wrap to a
+	// negative value, slip past the check and panic in the slice expression.
+	if uint64(len(d.B)) < l {
+		d.E = ErrInvalidSize
+		return ""
+	}
+	s := string(d.B[:l])
+	d.B = d.B[l:]
+	return s
+}
+
+// Varint64 reads a signed varint and advances past it. It returns 0 if an
+// error is already set, and sets ErrInvalidSize if no valid varint can be
+// decoded from the remaining bytes.
+func (d *Decbuf) Varint64() int64 {
+	if d.E != nil {
+		return 0
+	}
+	v, width := binary.Varint(d.B)
+	if width <= 0 {
+		d.E = ErrInvalidSize
+		return 0
+	}
+	d.B = d.B[width:]
+	return v
+}
+
+// Uvarint64 reads an unsigned varint and advances past it. It returns 0 if an
+// error is already set, and sets ErrInvalidSize if no valid uvarint can be
+// decoded from the remaining bytes.
+func (d *Decbuf) Uvarint64() uint64 {
+	if d.E != nil {
+		return 0
+	}
+	v, width := binary.Uvarint(d.B)
+	if width <= 0 {
+		d.E = ErrInvalidSize
+		return 0
+	}
+	d.B = d.B[width:]
+	return v
+}
+
+func (d *Decbuf) Be64() uint64 {
+	if d.E != nil {
+		return 0
+	}
+	if len(d.B) < 8 {
+		d.E = ErrInvalidSize
+		return 0
+	}
+	x := binary.BigEndian.Uint64(d.B)
+	d.B = d.B[8:]
+	return x
+}
+
+func (d *Decbuf) Be32() uint32 {
+	if d.E != nil {
+		return 0
+	}
+	if len(d.B) < 4 {
+		d.E = ErrInvalidSize
+		return 0
+	}
+	x := binary.BigEndian.Uint32(d.B)
+	d.B = d.B[4:]
+	return x
+}
+
+func (d *Decbuf) Byte() byte {
+	if d.E != nil {
+		return 0
+	}
+	if len(d.B) < 1 {
+		d.E = ErrInvalidSize
+		return 0
+	}
+	x := d.B[0]
+	d.B = d.B[1:]
+	return x
+}
+
+// Err returns the first error recorded while decoding, or nil.
+func (d *Decbuf) Err() error {
+	return d.E
+}
+
+// Len returns the number of bytes not yet consumed.
+func (d *Decbuf) Len() int {
+	return len(d.B)
+}
+
+// Get returns the bytes not yet consumed.
+func (d *Decbuf) Get() []byte {
+	return d.B
+}
+
+// ByteSlice abstracts a byte slice.
+// Len reports the total number of bytes available and Range returns the bytes
+// between start and end (presumably a half-open interval like a Go slice
+// expression; confirm against the implementations).
+type ByteSlice interface {
+	Len() int
+	Range(start, end int) []byte
+}
--- a/tsdb/errors/errors.go
+++ b/tsdb/errors/errors.go
@ -0,0 +1,62 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package errors
+
+import (
+	"bytes"
+	"fmt"
+)
+
+// MultiError is a list of errors that itself implements the error interface.
+type MultiError []error
+
+// Error joins the messages of the contained errors with "; ". When more than
+// one error is present the result is prefixed with the error count.
+func (es MultiError) Error() string {
+	var out bytes.Buffer
+
+	if n := len(es); n > 1 {
+		fmt.Fprintf(&out, "%d errors: ", n)
+	}
+
+	sep := ""
+	for _, e := range es {
+		out.WriteString(sep)
+		out.WriteString(e.Error())
+		sep = "; "
+	}
+
+	return out.String()
+}
+
+// Add appends err to the list. A nil err is ignored, and a MultiError is
+// flattened so that its elements are appended individually.
+func (es *MultiError) Add(err error) {
+	switch e := err.(type) {
+	case nil:
+		return
+	case MultiError:
+		*es = append(*es, e...)
+	default:
+		*es = append(*es, e)
+	}
+}
+
+// Err returns the list as an error, or nil when the list is empty.
+func (es MultiError) Err() error {
+	if len(es) > 0 {
+		return es
+	}
+	return nil
+}
--- a/tsdb/fileutil/dir_unix.go
+++ b/tsdb/fileutil/dir_unix.go
@ -0,0 +1,22 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !windows
+
+package fileutil
+
+import "os"
+
+// OpenDir opens the directory at path so that it can be synced.
+func OpenDir(path string) (*os.File, error) {
+	dir, err := os.Open(path)
+	return dir, err
+}
--- a/tsdb/fileutil/dir_windows.go
+++ b/tsdb/fileutil/dir_windows.go
@ -0,0 +1,46 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build windows
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+)
+
+// OpenDir opens a directory on windows with write access so that it can be
+// synced, wrapping the raw handle in an *os.File.
+func OpenDir(path string) (*os.File, error) {
+	handle, err := openDir(path)
+	if err == nil {
+		return os.NewFile(uintptr(handle), path), nil
+	}
+	return nil, err
+}
+
+func openDir(path string) (fd syscall.Handle, err error) {
+	if len(path) == 0 {
+		return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND
+	}
+	pathp, err := syscall.UTF16PtrFromString(path)
+	if err != nil {
+		return syscall.InvalidHandle, err
+	}
+	access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE)
+	sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE)
+	createmode := uint32(syscall.OPEN_EXISTING)
+	fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS)
+	return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0)
+}
--- a/tsdb/fileutil/fileutil.go
+++ b/tsdb/fileutil/fileutil.go
@ -0,0 +1,159 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fileutil provides utility methods used when dealing with the filesystem in tsdb.
+// It is largely copied from github.com/coreos/etcd/pkg/fileutil to avoid the
+// dependency chain it brings with it.
+// Please check github.com/coreos/etcd for licensing information.
+package fileutil
+
+import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// CopyDirs recursively copies everything below src into dest, creating dest
+// and any (including empty) subdirectories as needed.
+// Source and destination must be full paths.
+func CopyDirs(src, dest string) error {
+	if err := os.MkdirAll(dest, 0777); err != nil {
+		return err
+	}
+	entries, err := readDirs(src)
+	if err != nil {
+		return err
+	}
+
+	for _, rel := range entries {
+		from, to := filepath.Join(src, rel), filepath.Join(dest, rel)
+
+		info, err := os.Stat(from)
+		if err != nil {
+			return err
+		}
+
+		if !info.IsDir() {
+			if err := copyFile(from, to); err != nil {
+				return err
+			}
+			continue
+		}
+
+		// Directories are created explicitly so that empty ones are kept.
+		if err := os.MkdirAll(to, 0777); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// copyFile reads src fully into memory and writes it to dest with mode 0644.
+func copyFile(src, dest string) error {
+	contents, err := ioutil.ReadFile(src)
+	if err != nil {
+		return err
+	}
+	return ioutil.WriteFile(dest, contents, 0644)
+}
+
+// readDirs walks src recursively and returns the path of every file and
+// directory below it with the src prefix trimmed off (src itself is not
+// included). Any error reported by the walk, such as src not existing or an
+// unreadable subdirectory, is returned instead of being silently dropped.
+func readDirs(src string) ([]string, error) {
+	var files []string
+
+	err := filepath.Walk(src, func(path string, f os.FileInfo, err error) error {
+		if err != nil {
+			// Walk passes lstat/readdir failures in here; f may be nil.
+			return err
+		}
+		relativePath := strings.TrimPrefix(path, src)
+		if len(relativePath) > 0 {
+			files = append(files, relativePath)
+		}
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+	return files, nil
+}
+
+// ReadDir lists the names of all entries in dirpath, sorted in increasing order.
+func ReadDir(dirpath string) ([]string, error) {
+	d, err := os.Open(dirpath)
+	if err != nil {
+		return nil, err
+	}
+	defer d.Close()
+
+	entries, err := d.Readdirnames(-1)
+	if err != nil {
+		return nil, err
+	}
+	sort.Sort(sort.StringSlice(entries))
+	return entries, nil
+}
+
+// Rename renames from to to and then syncs the parent directory of to so
+// that the rename is persisted.
+func Rename(from, to string) error {
+	if err := os.Rename(from, to); err != nil {
+		return err
+	}
+
+	// Sync the parent directory to persist the rename.
+	parent, err := OpenDir(filepath.Dir(to))
+	if err != nil {
+		return err
+	}
+
+	if syncErr := parent.Sync(); syncErr != nil {
+		parent.Close()
+		return syncErr
+	}
+	return parent.Close()
+}
+
+// Replace moves a file or directory to a new location and deletes any previous data.
+// It is not atomic.
+func Replace(from, to string) error {
+	// Only an existing destination directory is removed up front; an existing
+	// destination file is left to os.Rename, which replaces it atomically.
+	if info, err := os.Stat(to); err == nil && info.IsDir() {
+		if err := os.RemoveAll(to); err != nil {
+			return err
+		}
+	}
+
+	if err := os.Rename(from, to); err != nil {
+		return err
+	}
+
+	// Sync the parent directory to persist the rename.
+	parent, err := OpenDir(filepath.Dir(to))
+	if err != nil {
+		return err
+	}
+
+	if syncErr := parent.Sync(); syncErr != nil {
+		parent.Close()
+		return syncErr
+	}
+	return parent.Close()
+}
--- a/tsdb/fileutil/flock.go
+++ b/tsdb/fileutil/flock.go
@ -0,0 +1,41 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+	"path/filepath"
+)
+
+// Releaser provides the Release method to release a file lock.
+// Flock returns a Releaser for every lock it acquires.
+type Releaser interface {
+	Release() error
+}
+
+// Flock locks the file with the provided name, creating its parent
+// directories first. The returned Releaser is used to release the lock.
+// existed reports whether the file could be stat'ed before the lock attempt.
+// A non-nil error is returned if the locking has failed. Neither this
+// function nor the returned Releaser is goroutine-safe.
+func Flock(fileName string) (r Releaser, existed bool, err error) {
+	if mkErr := os.MkdirAll(filepath.Dir(fileName), 0755); mkErr != nil {
+		return nil, false, mkErr
+	}
+
+	if _, statErr := os.Stat(fileName); statErr == nil {
+		existed = true
+	}
+
+	r, err = newLock(fileName)
+	return r, existed, err
+}
--- a/tsdb/fileutil/flock_plan9.go
+++ b/tsdb/fileutil/flock_plan9.go
@ -0,0 +1,32 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import "os"
+
+// plan9Lock is the Releaser returned by newLock in this file; it holds the
+// open lock file.
+type plan9Lock struct {
+	f *os.File
+}
+
+// Release closes the lock file.
+func (l *plan9Lock) Release() error {
+	return l.f.Close()
+}
+
+// newLock opens (creating if necessary) fileName with the exclusive-use mode
+// bit set and returns a Releaser that closes it again.
+func newLock(fileName string) (Releaser, error) {
+	// The parameter is fileName; the previous code referenced an undefined
+	// identifier "path" and could not compile.
+	f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644)
+	if err != nil {
+		return nil, err
+	}
+	return &plan9Lock{f}, nil
+}
--- a/tsdb/fileutil/flock_solaris.go
+++ b/tsdb/fileutil/flock_solaris.go
@ -0,0 +1,59 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build solaris
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+)
+
+// unixLock is a file lock taken with fcntl record locking; it holds the open
+// lock file.
+type unixLock struct {
+	f *os.File
+}
+
+// Release drops the lock and, if that succeeds, closes the lock file.
+func (l *unixLock) Release() error {
+	err := l.set(false)
+	if err == nil {
+		err = l.f.Close()
+	}
+	return err
+}
+
+// set acquires a write lock when lock is true and removes the lock otherwise,
+// using the non-waiting F_SETLK command.
+func (l *unixLock) set(lock bool) error {
+	// Start and Len keep their zero values.
+	var req syscall.Flock_t
+	req.Whence = 1
+	req.Type = syscall.F_UNLCK
+	if lock {
+		req.Type = syscall.F_WRLCK
+	}
+	return syscall.FcntlFlock(l.f.Fd(), syscall.F_SETLK, &req)
+}
+
+// newLock opens (creating if necessary) fileName and locks it. The file is
+// closed again if the lock cannot be taken.
+func newLock(fileName string) (Releaser, error) {
+	f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644)
+	if err != nil {
+		return nil, err
+	}
+	lk := &unixLock{f}
+	if err := lk.set(true); err != nil {
+		f.Close()
+		return nil, err
+	}
+	return lk, nil
+}
--- a/tsdb/fileutil/flock_test.go
+++ b/tsdb/fileutil/flock_test.go
@ -0,0 +1,80 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestLocking exercises Flock on a file in a fresh temporary directory: the
+// first lock must create the file, a second attempt must fail while the lock
+// is held, releasing must leave the file in place, and locking the existing
+// file again must succeed and report that it existed.
+func TestLocking(t *testing.T) {
+	dir := testutil.NewTemporaryDirectory("test_flock", t)
+	defer dir.Close()
+
+	fileName := filepath.Join(dir.Path(), "LOCK")
+
+	if _, err := os.Stat(fileName); err == nil {
+		t.Fatalf("File %q unexpectedly exists.", fileName)
+	}
+
+	lock, existed, err := Flock(fileName)
+	if err != nil {
+		t.Fatalf("Error locking file %q: %s", fileName, err)
+	}
+	if existed {
+		t.Errorf("File %q reported as existing during locking.", fileName)
+	}
+
+	// File must now exist.
+	if _, err = os.Stat(fileName); err != nil {
+		t.Errorf("Could not stat file %q expected to exist: %s", fileName, err)
+	}
+
+	// Try to lock again.
+	lockedAgain, existed, err := Flock(fileName)
+	if err == nil {
+		t.Fatalf("File %q locked twice.", fileName)
+	}
+	if lockedAgain != nil {
+		t.Error("Unsuccessful locking did not return nil.")
+	}
+	if !existed {
+		t.Errorf("Existing file %q not recognized.", fileName)
+	}
+
+	if err := lock.Release(); err != nil {
+		t.Errorf("Error releasing lock for file %q: %s", fileName, err)
+	}
+
+	// File must still exist.
+	if _, err = os.Stat(fileName); err != nil {
+		t.Errorf("Could not stat file %q expected to exist: %s", fileName, err)
+	}
+
+	// Lock existing file.
+	lock, existed, err = Flock(fileName)
+	if err != nil {
+		t.Fatalf("Error locking file %q: %s", fileName, err)
+	}
+	if !existed {
+		t.Errorf("Existing file %q not recognized.", fileName)
+	}
+
+	if err := lock.Release(); err != nil {
+		t.Errorf("Error releasing lock for file %q: %s", fileName, err)
+	}
+}
--- a/tsdb/fileutil/flock_unix.go
+++ b/tsdb/fileutil/flock_unix.go
@ -0,0 +1,54 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+)
+
+// unixLock is a file lock taken with flock(2); it holds the open lock file.
+type unixLock struct {
+	f *os.File
+}
+
+// Release drops the lock and, if that succeeds, closes the lock file.
+func (l *unixLock) Release() error {
+	err := l.set(false)
+	if err == nil {
+		err = l.f.Close()
+	}
+	return err
+}
+
+// set takes an exclusive lock when lock is true and unlocks otherwise. The
+// call is always made with LOCK_NB.
+func (l *unixLock) set(lock bool) error {
+	op := syscall.LOCK_UN | syscall.LOCK_NB
+	if lock {
+		op = syscall.LOCK_EX | syscall.LOCK_NB
+	}
+	return syscall.Flock(int(l.f.Fd()), op)
+}
+
+// newLock opens (creating if necessary) fileName and locks it. The file is
+// closed again if the lock cannot be taken.
+func newLock(fileName string) (Releaser, error) {
+	f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644)
+	if err != nil {
+		return nil, err
+	}
+	lk := &unixLock{f}
+	if err := lk.set(true); err != nil {
+		f.Close()
+		return nil, err
+	}
+	return lk, nil
+}
--- a/tsdb/fileutil/flock_windows.go
+++ b/tsdb/fileutil/flock_windows.go
@ -0,0 +1,36 @@
+// Copyright 2016 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import "syscall"
+
+// windowsLock is a file lock represented by an open handle that was created
+// with share mode 0.
+type windowsLock struct {
+	fd syscall.Handle
+}
+
+// Release closes the handle.
+func (fl *windowsLock) Release() error {
+	return syscall.Close(fl.fd)
+}
+
+// newLock creates fileName with CREATE_ALWAYS and share mode 0 and returns a
+// Releaser holding the resulting handle.
+func newLock(fileName string) (Releaser, error) {
+	name, err := syscall.UTF16PtrFromString(fileName)
+	if err != nil {
+		return nil, err
+	}
+	const access = syscall.GENERIC_READ | syscall.GENERIC_WRITE
+	handle, err := syscall.CreateFile(name, access, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0)
+	if err != nil {
+		return nil, err
+	}
+	return &windowsLock{fd: handle}, nil
+}
--- a/tsdb/fileutil/mmap.go
+++ b/tsdb/fileutil/mmap.go
@ -0,0 +1,61 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+
+	"github.com/pkg/errors"
+)
+
+// MmapFile pairs an open file (f) with the byte slice obtained by memory
+// mapping it (b).
+type MmapFile struct {
+	f *os.File
+	b []byte
+}
+
+// OpenMmapFile opens the file at path and memory-maps its full current size.
+// On success the caller owns the returned MmapFile and must Close it. On any
+// failure the file is closed again so that no descriptor is leaked.
+func OpenMmapFile(path string) (*MmapFile, error) {
+	// NOTE(review): the "try lock file" context looks copied from locking
+	// code; it is kept unchanged so existing error text stays stable.
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, errors.Wrap(err, "try lock file")
+	}
+	info, err := f.Stat()
+	if err != nil {
+		// The original error is the one worth reporting; a close failure
+		// on this read-only handle is deliberately ignored.
+		f.Close()
+		return nil, errors.Wrap(err, "stat")
+	}
+
+	b, err := mmap(f, int(info.Size()))
+	if err != nil {
+		f.Close()
+		return nil, errors.Wrap(err, "mmap")
+	}
+
+	return &MmapFile{f: f, b: b}, nil
+}
+
+// Close unmaps the file contents and closes the file. Both steps are always
+// attempted; an unmap error takes precedence over a close error.
+func (f *MmapFile) Close() error {
+	unmapErr := munmap(f.b)
+	closeErr := f.f.Close()
+
+	if unmapErr == nil {
+		return closeErr
+	}
+	return unmapErr
+}
+
+// File returns the underlying open file.
+func (f *MmapFile) File() *os.File {
+	return f.f
+}
+
+// Bytes returns the memory-mapped contents of the file.
+func (f *MmapFile) Bytes() []byte {
+	return f.b
+}
--- a/tsdb/fileutil/mmap_386.go
+++ b/tsdb/fileutil/mmap_386.go
@ -0,0 +1,18 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build windows
+
+package fileutil
+
+// maxMapSize is the length of the array type that the windows mmap
+// implementation casts the mapped address to before slicing it down to the
+// requested size.
+const maxMapSize = 0x7FFFFFFF // 2GB
--- a/tsdb/fileutil/mmap_amd64.go
+++ b/tsdb/fileutil/mmap_amd64.go
@ -0,0 +1,18 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build windows
+
+package fileutil
+
+// maxMapSize is the length of the array type that the windows mmap
+// implementation casts the mapped address to before slicing it down to the
+// requested size.
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
--- a/tsdb/fileutil/mmap_unix.go
+++ b/tsdb/fileutil/mmap_unix.go
@ -0,0 +1,30 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !windows,!plan9
+
+package fileutil
+
+import (
+	"os"
+
+	"golang.org/x/sys/unix"
+)
+
+// mmap maps the first length bytes of f (offset 0) into memory as a
+// read-only, shared mapping.
+func mmap(f *os.File, length int) ([]byte, error) {
+	return unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED)
+}
+
+// munmap releases a mapping previously returned by mmap.
+func munmap(b []byte) (err error) {
+	return unix.Munmap(b)
+}
--- a/tsdb/fileutil/mmap_windows.go
+++ b/tsdb/fileutil/mmap_windows.go
@ -0,0 +1,46 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+	"unsafe"
+)
+
+// mmap maps the first size bytes of f into memory read-only and returns them
+// as a byte slice. The file mapping handle is only needed to create the view
+// and is closed before returning; on every failure path both the handle and
+// any view already created are released.
+func mmap(f *os.File, size int) ([]byte, error) {
+	// CreateFileMapping takes the maximum size split into two 32-bit halves.
+	low, high := uint32(size), uint32(size>>32)
+	h, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)
+	if h == 0 {
+		return nil, os.NewSyscallError("CreateFileMapping", errno)
+	}
+
+	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size))
+	if addr == 0 {
+		// Do not leak the mapping handle when the view cannot be created.
+		syscall.CloseHandle(h)
+		return nil, os.NewSyscallError("MapViewOfFile", errno)
+	}
+
+	if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
+		// The caller never sees the view on this path, so unmap it here.
+		syscall.UnmapViewOfFile(addr)
+		return nil, os.NewSyscallError("CloseHandle", err)
+	}
+
+	return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil
+}
+
+// munmap unmaps the view that backs b. An empty slice has no addressable
+// first element, so it is treated as nothing to unmap instead of panicking.
+func munmap(b []byte) error {
+	if len(b) == 0 {
+		return nil
+	}
+	if err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil {
+		return os.NewSyscallError("UnmapViewOfFile", err)
+	}
+	return nil
+}
--- a/tsdb/fileutil/preallocate.go
+++ b/tsdb/fileutil/preallocate.go
@ -0,0 +1,54 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"io"
+	"os"
+)
+
+// Preallocate tries to allocate the space for given
+// file. This operation is only supported on linux by a
+// few filesystems (btrfs, ext4, etc.).
+// If the operation is unsupported, no error will be returned.
+// Otherwise, the error encountered will be returned.
+// With extendFile set the request goes to preallocExtend, otherwise to
+// preallocFixed.
+func Preallocate(f *os.File, sizeInBytes int64, extendFile bool) error {
+	switch {
+	case sizeInBytes == 0:
+		// fallocate will return EINVAL if length is 0; skip
+		return nil
+	case extendFile:
+		return preallocExtend(f, sizeInBytes)
+	default:
+		return preallocFixed(f, sizeInBytes)
+	}
+}
+
+// preallocExtendTrunc grows f to sizeInBytes with Truncate. It is the
+// fallback used when the platform preallocation call is unavailable. A file
+// that is already at least sizeInBytes long is left untouched, and the
+// current file offset is restored before returning.
+//
+// The previous code measured the size with Seek(sizeInBytes, io.SeekEnd),
+// which yields size+sizeInBytes; its guard therefore never fired and larger
+// files were truncated down to sizeInBytes.
+func preallocExtendTrunc(f *os.File, sizeInBytes int64) error {
+	curOff, err := f.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return err
+	}
+	size, err := f.Seek(0, io.SeekEnd)
+	if err != nil {
+		return err
+	}
+	if _, err = f.Seek(curOff, io.SeekStart); err != nil {
+		return err
+	}
+	if size >= sizeInBytes {
+		// Already large enough; extending must never shrink the file.
+		return nil
+	}
+	return f.Truncate(sizeInBytes)
+}
--- a/tsdb/fileutil/preallocate_darwin.go
+++ b/tsdb/fileutil/preallocate_darwin.go
@ -0,0 +1,41 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+	"unsafe"
+)
+
+// preallocExtend preallocates space with preallocFixed and, if that
+// succeeds, adjusts the file size with preallocExtendTrunc.
+func preallocExtend(f *os.File, sizeInBytes int64) error {
+	err := preallocFixed(f, sizeInBytes)
+	if err == nil {
+		err = preallocExtendTrunc(f, sizeInBytes)
+	}
+	return err
+}
+
+// preallocFixed issues an F_PREALLOCATE fcntl for sizeInBytes on f. An
+// ENOTSUP result is treated as success; any other errno is returned.
+func preallocFixed(f *os.File, sizeInBytes int64) error {
+	store := syscall.Fstore_t{
+		Flags:   syscall.F_ALLOCATEALL,
+		Posmode: syscall.F_PEOFPOSMODE,
+		Length:  sizeInBytes,
+	}
+	_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, f.Fd(), uintptr(syscall.F_PREALLOCATE), uintptr(unsafe.Pointer(&store)))
+	switch errno {
+	case 0, syscall.ENOTSUP:
+		return nil
+	}
+	return errno
+}
--- a/tsdb/fileutil/preallocate_linux.go
+++ b/tsdb/fileutil/preallocate_linux.go
@ -0,0 +1,47 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+)
+
+// preallocExtend calls fallocate on f for sizeInBytes starting at offset 0,
+// using the mode that also changes the file size. If fallocate reports ENOTSUP
+// or EINTR, the work is handed to preallocExtendTrunc instead; every other
+// error is returned unchanged.
+func preallocExtend(f *os.File, sizeInBytes int64) error {
+	// use mode = 0 to change size
+	err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes)
+	if err == nil {
+		return nil
+	}
+	if errno, ok := err.(syscall.Errno); ok {
+		switch errno {
+		// not supported; fallback
+		// fallocate EINTRs frequently in some environments; fallback
+		case syscall.ENOTSUP, syscall.EINTR:
+			return preallocExtendTrunc(f, sizeInBytes)
+		}
+	}
+	return err
+}
+
+// preallocFixed calls fallocate on f for sizeInBytes starting at offset 0 while
+// keeping the file size as it is. ENOTSUP is swallowed and reported as
+// success; every other error is returned unchanged.
+func preallocFixed(f *os.File, sizeInBytes int64) error {
+	// use mode = 1 to keep size; see FALLOC_FL_KEEP_SIZE
+	err := syscall.Fallocate(int(f.Fd()), 1, 0, sizeInBytes)
+	if err == nil {
+		return nil
+	}
+	// treat not supported as nil error
+	if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENOTSUP {
+		return nil
+	}
+	return err
+}
--- a/tsdb/fileutil/preallocate_other.go
+++ b/tsdb/fileutil/preallocate_other.go
@ -0,0 +1,25 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux,!darwin
+
+package fileutil
+
+import "os"
+
+// preallocExtend is the variant built for platforms other than linux and
+// darwin (see the build constraint of this file). It has no platform
+// preallocation call to try, so it delegates directly to preallocExtendTrunc.
+func preallocExtend(f *os.File, sizeInBytes int64) error {
+	return preallocExtendTrunc(f, sizeInBytes)
+}
+
+// preallocFixed is a no-op on platforms other than linux and darwin: it does
+// not touch f and always returns nil.
+func preallocFixed(f *os.File, sizeInBytes int64) error { return nil }
--- a/tsdb/fileutil/sync.go
+++ b/tsdb/fileutil/sync.go
@ -0,0 +1,24 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux,!darwin
+
+package fileutil
+
+import "os"
+
+// Fdatasync is a wrapper around f.Sync(). This generic version is built for
+// platforms other than linux and darwin (see the build constraint of this
+// file); those two platforms provide their own Fdatasync.
+func Fdatasync(f *os.File) error {
+	return f.Sync()
+}
--- a/tsdb/fileutil/sync_darwin.go
+++ b/tsdb/fileutil/sync_darwin.go
@ -0,0 +1,27 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build darwin
+
+package fileutil
+
+import (
+	"os"
+)
+
+// Fdatasync on darwin calls f.Sync(). The fcntl(F_FULLFSYNC) required for
+// actual persistence on physical drive media is not issued here directly.
+// NOTE(review): it is expected to come from os.File.Sync itself (documented
+// for macOS since Go 1.12) — confirm against the minimum supported toolchain.
+func Fdatasync(f *os.File) error {
+	return f.Sync()
+}
--- a/tsdb/fileutil/sync_linux.go
+++ b/tsdb/fileutil/sync_linux.go
@ -0,0 +1,29 @@
+// Copyright 2016 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fileutil
+
+import (
+	"os"
+	"syscall"
+)
+
+// Fdatasync flushes f with the fdatasync system call. It is similar to
+// fsync(), but does not flush modified metadata unless that metadata is needed
+// in order to allow a subsequent data retrieval to be correctly handled.
+func Fdatasync(f *os.File) error {
+	fd := int(f.Fd())
+	return syscall.Fdatasync(fd)
+}
--- a/tsdb/go.mod
+++ b/tsdb/go.mod
@ -0,0 +1,14 @@
+module github.com/prometheus/tsdb
+
+require (
+	github.com/cespare/xxhash v1.1.0
+	github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954
+	github.com/go-kit/kit v0.8.0
+	github.com/golang/snappy v0.0.1
+	github.com/oklog/ulid v1.3.1
+	github.com/pkg/errors v0.8.0
+	github.com/prometheus/client_golang v1.0.0
+	golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4
+	golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5
+	gopkg.in/alecthomas/kingpin.v2 v2.2.6
+)
--- a/tsdb/go.sum
+++ b/tsdb/go.sum
@ -0,0 +1,83 @@
+github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE=
+github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
+github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU=
+github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
+github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY=
+github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 h1:xJ4a3vCFaGF/jqvzLMYoU8P317H5OQ+Via4RmuPwCS0=
+github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
+github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0=
+github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
+github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
+github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954 h1:RMLoZVzv4GliuWafOuPuQDKSm1SJph7uCRnnS61JAn4=
+github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
+github.com/go-kit/kit v0.8.0 h1:Wz+5lgoB0kkuqLEc6NVmwRknTKP6dTGbSqvhZtBI/j0=
+github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
+github.com/go-logfmt/logfmt v0.3.0 h1:8HUsc87TaSWLKwrnumgC8/YconD2fJQsRJAsWaPg2ic=
+github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
+github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk=
+github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
+github.com/gogo/protobuf v1.1.1 h1:72R+M5VuhED/KujmZVcIquuo8mBgX4oVda//DQb3PXo=
+github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
+github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
+github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
+github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
+github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
+github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515 h1:T+h1c/A9Gawja4Y9mFVWj2vyii2bbUNDw3kt9VxK2EY=
+github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
+github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
+github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
+github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
+github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
+github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
+github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw=
+github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v0.9.1 h1:K47Rk0v/fkEfwfQet2KWhscE0cJzjgCCDBG2KHZoVno=
+github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
+github.com/prometheus/client_golang v1.0.0 h1:vrDKnkGzuGvhNAL56c7DBz29ZL+KxnoR0x7enabFceM=
+github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
+github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 h1:idejC8f05m9MGOsuEi1ATq9shN03HrxNkD/luQvxCv8=
+github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
+github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 h1:S/YWwWx/RA8rT8tKFRuGUZhuA90OyIBpPCXkcbwU8DE=
+github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/prometheus/common v0.4.1 h1:K0MGApIoQvMw27RTdJkPbr3JZ7DNbtxQNyi5STVM6Kw=
+github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
+github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d h1:GoAlyOgbOEIFdaDqxJVlbOQ1DtGmZWs/Qau0hIlk+WQ=
+github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
+github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs=
+github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
+github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
+github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ=
+github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
+github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
+golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTmV7VDcZyvRZ+QQXkXTZQ=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw=
+golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5 h1:mzjBh+S5frKOsOBobWIMAbXavqjmgO17k/2puhcFR94=
+golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc=
+gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
--- a/tsdb/goversion/goversio_test.go
+++ b/tsdb/goversion/goversio_test.go
@ -0,0 +1,27 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goversion_test
+
+import (
+	"testing"
+
+	_ "github.com/prometheus/tsdb/goversion"
+)
+
+// This test is intentionally blank and exists only so `go test` believes
+// there is something to test.
+//
+// The blank import above is actually what invokes the test of this package. If
+// the import succeeds (the code compiles), the test passed.
+func Test(t *testing.T) {}
--- a/tsdb/goversion/goversion.go
+++ b/tsdb/goversion/goversion.go
@ -0,0 +1,19 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+
+// Package goversion enforces the Go version supported by the tsdb module.
+package goversion
+
+// _SoftwareRequiresGOVERSION1_12 is only declared when this file is built,
+// i.e. when the go1.12 build constraint above is satisfied. Its name is meant
+// to show up in the "undefined" compile error produced on older toolchains.
+const _SoftwareRequiresGOVERSION1_12 = uint8(0)
--- a/tsdb/goversion/init.go
+++ b/tsdb/goversion/init.go
@ -0,0 +1,17 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goversion
+
+// This will fail to compile if the Go toolchain version isn't >= 1.12: the
+// constant referenced here is declared in a file guarded by a go1.12 build
+// constraint, so on older toolchains the identifier is undefined.
+var _ = _SoftwareRequiresGOVERSION1_12
--- a/tsdb/head.go
+++ b/tsdb/head.go
--- a/tsdb/head_bench_test.go
+++ b/tsdb/head_bench_test.go
@ -0,0 +1,120 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"strconv"
+	"sync/atomic"
+	"testing"
+
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// BenchmarkHeadStripeSeriesCreate measures sequential creation of b.N series,
+// each with a distinct value for label "a", through h.getOrCreate.
+func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
+	h, err := NewHead(nil, nil, nil, 1000)
+	testutil.Ok(b, err)
+	// Surface a failing Close instead of silently dropping its error, as
+	// BenchmarkHeadPostingForMatchers in this file does.
+	defer func() {
+		testutil.Ok(b, h.Close())
+	}()
+
+	for i := 0; i < b.N; i++ {
+		h.getOrCreate(uint64(i), labels.FromStrings("a", strconv.Itoa(i)))
+	}
+}
+
+// BenchmarkHeadStripeSeriesCreateParallel measures creation of series through
+// h.getOrCreate from the goroutines started by b.RunParallel. A shared atomic
+// counter hands every iteration its own value, so each call uses a distinct
+// label value regardless of which goroutine runs it.
+func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
+	h, err := NewHead(nil, nil, nil, 1000)
+	testutil.Ok(b, err)
+	// Surface a failing Close instead of silently dropping its error, as
+	// BenchmarkHeadPostingForMatchers in this file does.
+	defer func() {
+		testutil.Ok(b, h.Close())
+	}()
+
+	var count int64
+
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			i := atomic.AddInt64(&count, 1)
+			h.getOrCreate(uint64(i), labels.FromStrings("a", strconv.Itoa(int(i))))
+		}
+	})
+}
+
+// BenchmarkHeadPostingForMatchers fills a head with 10 * 100000 * 5 series
+// over the labels "i", "n" and "j", then runs PostingsForMatchers once per
+// iteration for each matcher combination listed in cases. Every case is a
+// sub-benchmark named after the selector it represents.
+func BenchmarkHeadPostingForMatchers(b *testing.B) {
+	h, err := NewHead(nil, nil, nil, 1000)
+	testutil.Ok(b, err)
+	defer func() {
+		testutil.Ok(b, h.Close())
+	}()
+
+	var ref uint64
+
+	// addSeries registers l under the next sequential series reference.
+	addSeries := func(l labels.Labels) {
+		ref++
+		h.getOrCreateWithID(ref, l.Hash(), l)
+	}
+
+	for n := 0; n < 10; n++ {
+		for i := 0; i < 100000; i++ {
+			addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", strconv.Itoa(n), "j", "foo"))
+			// Have some series that won't be matched, to properly test inverted matches.
+			addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", strconv.Itoa(n), "j", "bar"))
+			addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "0_"+strconv.Itoa(n), "j", "bar"))
+			addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "1_"+strconv.Itoa(n), "j", "bar"))
+			addSeries(labels.FromStrings("i", strconv.Itoa(i), "n", "2_"+strconv.Itoa(n), "j", "foo"))
+		}
+	}
+
+	n1 := labels.NewEqualMatcher("n", "1")
+
+	jFoo := labels.NewEqualMatcher("j", "foo")
+	jNotFoo := labels.Not(jFoo)
+
+	iStar := labels.NewMustRegexpMatcher("i", "^.*$")
+	iPlus := labels.NewMustRegexpMatcher("i", "^.+$")
+	i1Plus := labels.NewMustRegexpMatcher("i", "^1.+$")
+	iEmptyRe := labels.NewMustRegexpMatcher("i", "^$")
+	iNotEmpty := labels.Not(labels.NewEqualMatcher("i", ""))
+	// iNot2 must match on label "i": the cases using it are named `i!="2"`.
+	// Matching on "n" instead would be a no-op next to n="1" and the
+	// sub-benchmark would not measure the selector its name advertises.
+	iNot2 := labels.Not(labels.NewEqualMatcher("i", "2"))
+	iNot2Star := labels.Not(labels.NewMustRegexpMatcher("i", "^2.*$"))
+
+	cases := []struct {
+		name     string
+		matchers []labels.Matcher
+	}{
+		{`n="1"`, []labels.Matcher{n1}},
+		{`n="1",j="foo"`, []labels.Matcher{n1, jFoo}},
+		{`j="foo",n="1"`, []labels.Matcher{jFoo, n1}},
+		{`n="1",j!="foo"`, []labels.Matcher{n1, jNotFoo}},
+		{`i=~".*"`, []labels.Matcher{iStar}},
+		{`i=~".+"`, []labels.Matcher{iPlus}},
+		{`i=~""`, []labels.Matcher{iEmptyRe}},
+		{`i!=""`, []labels.Matcher{iNotEmpty}},
+		{`n="1",i=~".*",j="foo"`, []labels.Matcher{n1, iStar, jFoo}},
+		{`n="1",i=~".*",i!="2",j="foo"`, []labels.Matcher{n1, iStar, iNot2, jFoo}},
+		{`n="1",i!=""`, []labels.Matcher{n1, iNotEmpty}},
+		{`n="1",i!="",j="foo"`, []labels.Matcher{n1, iNotEmpty, jFoo}},
+		{`n="1",i=~".+",j="foo"`, []labels.Matcher{n1, iPlus, jFoo}},
+		{`n="1",i=~"1.+",j="foo"`, []labels.Matcher{n1, i1Plus, jFoo}},
+		{`n="1",i=~".+",i!="2",j="foo"`, []labels.Matcher{n1, iPlus, iNot2, jFoo}},
+		{`n="1",i=~".+",i!~"2.*",j="foo"`, []labels.Matcher{n1, iPlus, iNot2Star, jFoo}},
+	}
+
+	for _, c := range cases {
+		b.Run(c.name, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				_, err := PostingsForMatchers(h.indexRange(0, 1000), c.matchers...)
+				testutil.Ok(b, err)
+			}
+		})
+	}
+}
--- a/tsdb/head_test.go
+++ b/tsdb/head_test.go
--- a/tsdb/index/index.go
+++ b/tsdb/index/index.go
--- a/tsdb/index/index_test.go
+++ b/tsdb/index/index_test.go
@ -0,0 +1,429 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package index
+
+import (
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"sort"
+	"testing"
+
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/tsdb/chunks"
+	"github.com/prometheus/tsdb/encoding"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// series is what mockIndex stores per series reference: the label set and the
+// metadata of its chunks.
+type series struct {
+	l      labels.Labels
+	chunks []chunks.Meta
+}
+
+// mockIndex is a map-backed index used by the tests in this file as the
+// expected state to compare a written index file against. Its methods have
+// value receivers but write into the maps below, so all copies of a mockIndex
+// share the same contents.
+type mockIndex struct {
+	// series maps a series reference to its labels and chunk metadata.
+	series map[uint64]series
+	// labelIndex maps a label name to the values written for it.
+	labelIndex map[string][]string
+	// postings maps a label pair to the series references written for it.
+	postings map[labels.Label][]uint64
+	// symbols holds every label name and value seen by AddSeries.
+	symbols map[string]struct{}
+}
+
+// newMockIndex returns a mockIndex whose maps are all allocated and empty.
+func newMockIndex() mockIndex {
+	return mockIndex{
+		series:     map[uint64]series{},
+		labelIndex: map[string][]string{},
+		postings:   map[labels.Label][]uint64{},
+		symbols:    map[string]struct{}{},
+	}
+}
+
+// Symbols returns the symbol set collected so far. The map itself is returned,
+// not a copy, and the error is always nil.
+func (m mockIndex) Symbols() (map[string]struct{}, error) {
+	return m.symbols, nil
+}
+
+// AddSeries records the labels and chunk metadata of a series under ref and
+// adds every label name and value to the symbol set. It returns an error if
+// ref was already added.
+func (m mockIndex) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error {
+	if _, exists := m.series[ref]; exists {
+		return errors.Errorf("series with reference %d already added", ref)
+	}
+	for i := range l {
+		m.symbols[l[i].Name] = struct{}{}
+		m.symbols[l[i].Value] = struct{}{}
+	}
+
+	entry := series{l: l}
+	for _, meta := range chunks {
+		// Actual chunk data is not stored in the index.
+		meta.Chunk = nil
+		entry.chunks = append(entry.chunks, meta)
+	}
+	m.series[ref] = entry
+
+	return nil
+}
+
+// WriteLabelIndex stores values as the label index of the single name in
+// names. The values slice is sorted in place before being stored. Anything
+// other than exactly one name is rejected with an error.
+func (m mockIndex) WriteLabelIndex(names []string, values []string) error {
+	// TODO support composite indexes
+	if len(names) == 1 {
+		sort.Strings(values)
+		m.labelIndex[names[0]] = values
+		return nil
+	}
+	return errors.New("composite indexes not supported yet")
+}
+
+// WritePostings expands it and stores the resulting references under the label
+// pair name/value. It returns an error if postings for that pair were already
+// written, or if expanding the iterator fails.
+func (m mockIndex) WritePostings(name, value string, it Postings) error {
+	key := labels.Label{Name: name, Value: value}
+	if _, dup := m.postings[key]; dup {
+		return errors.Errorf("postings for %s already added", key)
+	}
+	refs, err := ExpandPostings(it)
+	if err == nil {
+		m.postings[key] = refs
+	}
+	return err
+}
+
+// Close does nothing and always returns nil.
+func (m mockIndex) Close() error {
+	return nil
+}
+
+// LabelValues returns the values stored for the single label name in names,
+// wrapped as string tuples of length one. Anything other than exactly one
+// name is rejected with an error.
+func (m mockIndex) LabelValues(names ...string) (StringTuples, error) {
+	// TODO support composite indexes
+	if len(names) == 1 {
+		return NewStringTuples(m.labelIndex[names[0]], 1)
+	}
+	return nil, errors.New("composite indexes not supported yet")
+}
+
+// Postings returns a list-backed iterator over the references stored for the
+// label pair name/value. The error is always nil.
+func (m mockIndex) Postings(name, value string) (Postings, error) {
+	refs := m.postings[labels.Label{Name: name, Value: value}]
+	return NewListPostings(refs), nil
+}
+
+// SortedPostings expands p and returns its references ordered by the label
+// sets of the series they point to. If expanding fails, the returned postings
+// carry the wrapped error.
+func (m mockIndex) SortedPostings(p Postings) Postings {
+	refs, err := ExpandPostings(p)
+	if err != nil {
+		return ErrPostings(errors.Wrap(err, "expand postings"))
+	}
+
+	byLabels := func(a, b int) bool {
+		return labels.Compare(m.series[refs[a]].l, m.series[refs[b]].l) < 0
+	}
+	sort.Slice(refs, byLabels)
+	return NewListPostings(refs)
+}
+
+// Series copies the labels and chunk metadata stored under ref into lset and
+// chks, overwriting their previous contents. It returns an error if ref is
+// unknown.
+func (m mockIndex) Series(ref uint64, lset *labels.Labels, chks *[]chunks.Meta) error {
+	entry, found := m.series[ref]
+	if !found {
+		return errors.New("not found")
+	}
+	// Truncate to length zero first so the destination slices are reused.
+	*lset = append((*lset)[:0], entry.l...)
+	*chks = append((*chks)[:0], entry.chunks...)
+
+	return nil
+}
+
+// LabelIndices returns one single-element name list per label name that has a
+// label index. The order follows map iteration and is therefore unspecified;
+// the error is always nil.
+func (m mockIndex) LabelIndices() ([][]string, error) {
+	names := make([][]string, 0, len(m.labelIndex))
+	for name := range m.labelIndex {
+		names = append(names, []string{name})
+	}
+	return names, nil
+}
+
+// TestIndexRW_Create_Open checks that an index file written without any
+// content can be opened by a reader, and that the same file is rejected once
+// its leading magic bytes have been overwritten.
+func TestIndexRW_Create_Open(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_index_create")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	fn := filepath.Join(dir, indexFilename)
+
+	// An empty index must still result in a readable file.
+	iw, err := NewWriter(fn)
+	testutil.Ok(t, err)
+	testutil.Ok(t, iw.Close())
+
+	ir, err := NewFileReader(fn)
+	testutil.Ok(t, err)
+	testutil.Ok(t, ir.Close())
+
+	// Modify magic header must cause open to fail.
+	f, err := os.OpenFile(fn, os.O_WRONLY, 0666)
+	testutil.Ok(t, err)
+	_, err = f.WriteAt([]byte{0, 0}, 0)
+	testutil.Ok(t, err)
+	testutil.Ok(t, f.Close())
+
+	// Open the corrupted file itself. Opening the directory would fail for an
+	// unrelated reason and leave the magic check untested.
+	_, err = NewFileReader(fn)
+	testutil.NotOk(t, err)
+}
+
+// TestIndexRW_Postings writes four series and one postings list referencing
+// all of them, then reads the list back and checks that it yields exactly
+// those series, in order, with no chunk metadata attached.
+func TestIndexRW_Postings(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_index_postings")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	fn := filepath.Join(dir, indexFilename)
+
+	iw, err := NewWriter(fn)
+	testutil.Ok(t, err)
+
+	series := []labels.Labels{
+		labels.FromStrings("a", "1", "b", "1"),
+		labels.FromStrings("a", "1", "b", "2"),
+		labels.FromStrings("a", "1", "b", "3"),
+		labels.FromStrings("a", "1", "b", "4"),
+	}
+
+	err = iw.AddSymbols(map[string]struct{}{
+		"a": {},
+		"b": {},
+		"1": {},
+		"2": {},
+		"3": {},
+		"4": {},
+	})
+	testutil.Ok(t, err)
+
+	// Postings lists are only written if a series with the respective
+	// reference was added before.
+	testutil.Ok(t, iw.AddSeries(1, series[0]))
+	testutil.Ok(t, iw.AddSeries(2, series[1]))
+	testutil.Ok(t, iw.AddSeries(3, series[2]))
+	testutil.Ok(t, iw.AddSeries(4, series[3]))
+
+	err = iw.WritePostings("a", "1", newListPostings(1, 2, 3, 4))
+	testutil.Ok(t, err)
+
+	testutil.Ok(t, iw.Close())
+
+	ir, err := NewFileReader(fn)
+	testutil.Ok(t, err)
+
+	p, err := ir.Postings("a", "1")
+	testutil.Ok(t, err)
+
+	var l labels.Labels
+	var c []chunks.Meta
+
+	i := 0
+	for ; p.Next(); i++ {
+		// Fail cleanly instead of panicking on series[i] if the reader
+		// yields more entries than were written.
+		testutil.Assert(t, i < len(series), "")
+
+		err := ir.Series(p.At(), &l, &c)
+
+		testutil.Ok(t, err)
+		testutil.Equals(t, 0, len(c))
+		testutil.Equals(t, series[i], l)
+	}
+	testutil.Ok(t, p.Err())
+	// Without this check an empty postings list would pass the loop above
+	// without a single comparison.
+	testutil.Equals(t, len(series), i)
+
+	testutil.Ok(t, ir.Close())
+}
+
+// TestPersistence_index_e2e writes label sets loaded from the 20kseries.json
+// test data into an index file and, in parallel, into a mockIndex. It then
+// reopens the file and checks that postings, series, label values and symbols
+// read back from it agree with what the mock holds.
+func TestPersistence_index_e2e(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_persistence_e2e")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	lbls, err := labels.ReadLabels(filepath.Join("..", "testdata", "20kseries.json"), 20000)
+	testutil.Ok(t, err)
+
+	// Sort labels as the index writer expects series in sorted order.
+	sort.Sort(labels.Slice(lbls))
+
+	// Collect every label name and value; the writer is given the full
+	// symbol set before any series is added.
+	symbols := map[string]struct{}{}
+	for _, lset := range lbls {
+		for _, l := range lset {
+			symbols[l.Name] = struct{}{}
+			symbols[l.Value] = struct{}{}
+		}
+	}
+
+	var input indexWriterSeriesSlice
+
+	// Generate ChunkMetas for every label set.
+	for i, lset := range lbls {
+		var metas []chunks.Meta
+
+		// Series i gets (i % 20) + 1 chunks with consecutive time ranges and
+		// random references.
+		for j := 0; j <= (i % 20); j++ {
+			metas = append(metas, chunks.Meta{
+				MinTime: int64(j * 10000),
+				MaxTime: int64((j + 1) * 10000),
+				Ref:     rand.Uint64(),
+				Chunk:   chunkenc.NewXORChunk(),
+			})
+		}
+		input = append(input, &indexWriterSeries{
+			labels: lset,
+			chunks: metas,
+		})
+	}
+
+	iw, err := NewWriter(filepath.Join(dir, indexFilename))
+	testutil.Ok(t, err)
+
+	testutil.Ok(t, iw.AddSymbols(symbols))
+
+	// Population procedure as done by compaction.
+	var (
+		postings = NewMemPostings()
+		values   = map[string]map[string]struct{}{}
+	)
+
+	mi := newMockIndex()
+
+	// Add every series to both the writer and the mock, using its position in
+	// input as the reference, and record label values and postings on the way.
+	for i, s := range input {
+		err = iw.AddSeries(uint64(i), s.labels, s.chunks...)
+		testutil.Ok(t, err)
+		testutil.Ok(t, mi.AddSeries(uint64(i), s.labels, s.chunks...))
+
+		for _, l := range s.labels {
+			valset, ok := values[l.Name]
+			if !ok {
+				valset = map[string]struct{}{}
+				values[l.Name] = valset
+			}
+			valset[l.Value] = struct{}{}
+		}
+		postings.Add(uint64(i), s.labels)
+	}
+
+	// Write one label index per label name, with its values sorted.
+	for k, v := range values {
+		var vals []string
+		for e := range v {
+			vals = append(vals, e)
+		}
+		sort.Strings(vals)
+
+		testutil.Ok(t, iw.WriteLabelIndex([]string{k}, vals))
+		testutil.Ok(t, mi.WriteLabelIndex([]string{k}, vals))
+	}
+
+	// The postings list under the empty label pair references every series.
+	all := make([]uint64, len(lbls))
+	for i := range all {
+		all[i] = uint64(i)
+	}
+	err = iw.WritePostings("", "", newListPostings(all...))
+	testutil.Ok(t, err)
+	testutil.Ok(t, mi.WritePostings("", "", newListPostings(all...)))
+
+	for n, e := range postings.m {
+		for v := range e {
+			err = iw.WritePostings(n, v, postings.Get(n, v))
+			testutil.Ok(t, err)
+			// NOTE(review): the mock's error is discarded here. mockIndex
+			// rejects a label pair that was already written, so this may be
+			// deliberate if postings.m also contains the empty pair written
+			// above — confirm before adding a check.
+			mi.WritePostings(n, v, postings.Get(n, v))
+		}
+	}
+
+	err = iw.Close()
+	testutil.Ok(t, err)
+
+	ir, err := NewFileReader(filepath.Join(dir, indexFilename))
+	testutil.Ok(t, err)
+
+	// Every postings list held by the mock must be readable from the file and
+	// resolve, entry by entry, to the same labels and chunk metadata.
+	for p := range mi.postings {
+		gotp, err := ir.Postings(p.Name, p.Value)
+		testutil.Ok(t, err)
+
+		expp, err := mi.Postings(p.Name, p.Value)
+		testutil.Ok(t, err)
+
+		var lset, explset labels.Labels
+		var chks, expchks []chunks.Meta
+
+		for gotp.Next() {
+			testutil.Assert(t, expp.Next() == true, "")
+
+			ref := gotp.At()
+
+			err := ir.Series(ref, &lset, &chks)
+			testutil.Ok(t, err)
+
+			err = mi.Series(expp.At(), &explset, &expchks)
+			testutil.Ok(t, err)
+			testutil.Equals(t, explset, lset)
+			testutil.Equals(t, expchks, chks)
+		}
+		// The expected list must be exhausted too, otherwise the file held
+		// fewer entries than the mock.
+		testutil.Assert(t, expp.Next() == false, "")
+		testutil.Ok(t, gotp.Err())
+	}
+
+	// Label values read from the file must equal the mock's, tuple by tuple.
+	for k, v := range mi.labelIndex {
+		tplsExp, err := NewStringTuples(v, 1)
+		testutil.Ok(t, err)
+
+		tplsRes, err := ir.LabelValues(k)
+		testutil.Ok(t, err)
+
+		testutil.Equals(t, tplsExp.Len(), tplsRes.Len())
+		for i := 0; i < tplsExp.Len(); i++ {
+			strsExp, err := tplsExp.At(i)
+			testutil.Ok(t, err)
+
+			strsRes, err := tplsRes.At(i)
+			testutil.Ok(t, err)
+
+			testutil.Equals(t, strsExp, strsRes)
+		}
+	}
+
+	gotSymbols, err := ir.Symbols()
+	testutil.Ok(t, err)
+
+	// Same size plus every expected symbol present means the sets are equal.
+	testutil.Equals(t, len(mi.symbols), len(gotSymbols))
+	for s := range mi.symbols {
+		_, ok := gotSymbols[s]
+		testutil.Assert(t, ok, "")
+	}
+
+	testutil.Ok(t, ir.Close())
+}
+
+// TestDecbufUvariantWithInvalidBuffer checks that building a decoding buffer
+// with NewDecbufUvarintAt over six 0x81 bytes leaves the buffer in an error
+// state.
+func TestDecbufUvariantWithInvalidBuffer(t *testing.T) {
+	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
+
+	db := encoding.NewDecbufUvarintAt(b, 0, castagnoliTable)
+	testutil.NotOk(t, db.Err())
+}
+
+// TestReaderWithInvalidBuffer checks that NewReader returns an error when it
+// is given six 0x81 bytes instead of index data.
+func TestReaderWithInvalidBuffer(t *testing.T) {
+	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
+
+	_, err := NewReader(b)
+	testutil.NotOk(t, err)
+}
+
+// TestNewFileReaderErrorNoOpenFiles ensures that in case of an error no file remains open.
+// It writes a file with arbitrary text as its contents, expects NewFileReader
+// to reject it, and then removes the temporary directory.
+func TestNewFileReaderErrorNoOpenFiles(t *testing.T) {
+	dir := testutil.NewTemporaryDirectory("block", t)
+
+	idxName := filepath.Join(dir.Path(), "index")
+	err := ioutil.WriteFile(idxName, []byte("corrupted contents"), 0644)
+	testutil.Ok(t, err)
+
+	_, err = NewFileReader(idxName)
+	testutil.NotOk(t, err)
+
+	// dir.Close will fail on Win if idxName fd is not closed on error path.
+	// NOTE(review): presumably dir.Close reports that failure through the t
+	// passed to NewTemporaryDirectory — confirm in testutil.
+	dir.Close()
+}
--- a/tsdb/index/postings.go
+++ b/tsdb/index/postings.go
@ -0,0 +1,691 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package index
+
+import (
+	"container/heap"
+	"encoding/binary"
+	"runtime"
+	"sort"
+	"strings"
+	"sync"
+
+	"github.com/prometheus/tsdb/labels"
+)
+
+// allPostingsKey is the zero-value label under which the postings list of all
+// IDs is stored.
+var allPostingsKey = labels.Label{}
+
+// AllPostingsKey returns the name and value of the label key that is used to
+// store the postings list of all existing IDs.
+func AllPostingsKey() (name, value string) {
+	name, value = allPostingsKey.Name, allPostingsKey.Value
+	return name, value
+}
+
+// MemPostings holds postings list for series ID per label pair. They may be written
+// to out of order.
+// EnsureOrder() must be called once before any reads are done. This allows for quick
+// unordered batch fills on startup.
+type MemPostings struct {
+	// mtx guards m and ordered.
+	mtx sync.RWMutex
+	// m maps label name -> label value -> list of series IDs.
+	m map[string]map[string][]uint64
+	// ordered reports whether inserts keep every list sorted.
+	ordered bool
+}
+
+// NewMemPostings returns a MemPostings that keeps its lists sorted on insert
+// and is therefore ready for reads and writes right away.
+func NewMemPostings() *MemPostings {
+	p := &MemPostings{ordered: true}
+	p.m = make(map[string]map[string][]uint64, 512)
+	return p
+}
+
+// NewUnorderedMemPostings returns a MemPostings that accepts out-of-order
+// inserts. It is not safe to read from until EnsureOrder has been called once.
+func NewUnorderedMemPostings() *MemPostings {
+	p := &MemPostings{ordered: false}
+	p.m = make(map[string]map[string][]uint64, 512)
+	return p
+}
+
+// SortedKeys returns every label pair present in the postings, ordered by
+// label name and then by label value.
+func (p *MemPostings) SortedKeys() []labels.Label {
+	p.mtx.RLock()
+	pairs := make([]labels.Label, 0, len(p.m))
+	for name, values := range p.m {
+		for value := range values {
+			pairs = append(pairs, labels.Label{Name: name, Value: value})
+		}
+	}
+	p.mtx.RUnlock()
+
+	// The sort runs after the read lock has been released.
+	sort.Slice(pairs, func(a, b int) bool {
+		byName := strings.Compare(pairs[a].Name, pairs[b].Name)
+		if byName == 0 {
+			return pairs[a].Value < pairs[b].Value
+		}
+		return byName < 0
+	})
+	return pairs
+}
+
+// Get returns a postings list for the given label pair, or the EmptyPostings
+// sentinel when no list is stored for it.
+func (p *MemPostings) Get(name, value string) Postings {
+	p.mtx.RLock()
+	// Indexing a missing (nil) inner map yields a nil slice.
+	ids := p.m[name][value]
+	p.mtx.RUnlock()
+
+	if ids != nil {
+		return newListPostings(ids...)
+	}
+	return EmptyPostings()
+}
+
+// All returns a postings list over all documents ever added, i.e. the list
+// stored under AllPostingsKey.
+func (p *MemPostings) All() Postings {
+	name, value := AllPostingsKey()
+	return p.Get(name, value)
+}
+
+// EnsureOrder ensures that all postings lists are sorted. After it returns all further
+// calls to Add and addFor will insert new IDs in a sorted manner.
+// It holds the write lock for its whole duration and is a no-op when the
+// postings are already marked as ordered.
+func (p *MemPostings) EnsureOrder() {
+	p.mtx.Lock()
+	defer p.mtx.Unlock()
+
+	if p.ordered {
+		return
+	}
+
+	// Sort the lists in parallel with GOMAXPROCS workers fed from workc.
+	n := runtime.GOMAXPROCS(0)
+	workc := make(chan []uint64)
+
+	var wg sync.WaitGroup
+	wg.Add(n)
+
+	for i := 0; i < n; i++ {
+		go func() {
+			// Each list is sorted in place; the map entries share the backing array.
+			for l := range workc {
+				sort.Slice(l, func(i, j int) bool { return l[i] < l[j] })
+			}
+			wg.Done()
+		}()
+	}
+
+	for _, e := range p.m {
+		for _, l := range e {
+			workc <- l
+		}
+	}
+	// Closing the channel ends the workers' range loops; wait for all of them.
+	close(workc)
+	wg.Wait()
+
+	p.ordered = true
+}
+
+// Delete removes all ids in the given map from the postings lists.
+// Label values whose list becomes empty are dropped, and so are label names
+// that end up without any values. The lock is taken and released per list so
+// that concurrent readers are not blocked for the whole operation.
+func (p *MemPostings) Delete(deleted map[uint64]struct{}) {
+	var keys, vals []string
+
+	// Collect all keys relevant for deletion once. New keys added afterwards
+	// can by definition not be affected by any of the given deletes.
+	p.mtx.RLock()
+	for n := range p.m {
+		keys = append(keys, n)
+	}
+	p.mtx.RUnlock()
+
+	for _, n := range keys {
+		// Snapshot the values of this label name; vals is reused across names.
+		p.mtx.RLock()
+		vals = vals[:0]
+		for v := range p.m[n] {
+			vals = append(vals, v)
+		}
+		p.mtx.RUnlock()
+
+		// For each posting we first analyse whether the postings list is affected by the deletes.
+		// If yes, we actually reallocate a new postings list.
+		for _, l := range vals {
+			// Only lock for processing one postings list so we don't block reads for too long.
+			p.mtx.Lock()
+
+			found := false
+			for _, id := range p.m[n][l] {
+				if _, ok := deleted[id]; ok {
+					found = true
+					break
+				}
+			}
+			if !found {
+				p.mtx.Unlock()
+				continue
+			}
+			// Build the replacement in a fresh slice rather than filtering in place.
+			repl := make([]uint64, 0, len(p.m[n][l]))
+
+			for _, id := range p.m[n][l] {
+				if _, ok := deleted[id]; !ok {
+					repl = append(repl, id)
+				}
+			}
+			if len(repl) > 0 {
+				p.m[n][l] = repl
+			} else {
+				delete(p.m[n], l)
+			}
+			p.mtx.Unlock()
+		}
+		// Drop the label name entirely once it has no values left.
+		p.mtx.Lock()
+		if len(p.m[n]) == 0 {
+			delete(p.m, n)
+		}
+		p.mtx.Unlock()
+	}
+}
+
+// Iter calls f once for every stored label pair together with its postings
+// list. It stops at the first error returned by f and returns that error.
+// The read lock is held for the whole iteration.
+func (p *MemPostings) Iter(f func(labels.Label, Postings) error) error {
+	p.mtx.RLock()
+	defer p.mtx.RUnlock()
+
+	for name, values := range p.m {
+		for value, ids := range values {
+			key := labels.Label{Name: name, Value: value}
+			if err := f(key, newListPostings(ids...)); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// Add registers id under every label of lset and under the all-postings key.
+func (p *MemPostings) Add(id uint64, lset labels.Labels) {
+	p.mtx.Lock()
+
+	for i := range lset {
+		p.addFor(id, lset[i])
+	}
+	// Every ID is also recorded in the list of all postings.
+	p.addFor(id, allPostingsKey)
+
+	p.mtx.Unlock()
+}
+
+// addFor appends id to the postings list of label l, creating the inner map
+// for the label name on first use. It does no locking of its own; Add calls
+// it while holding the write lock.
+func (p *MemPostings) addFor(id uint64, l labels.Label) {
+	values, found := p.m[l.Name]
+	if !found {
+		values = map[string][]uint64{}
+		p.m[l.Name] = values
+	}
+	ids := append(values[l.Value], id)
+	values[l.Value] = ids
+
+	if !p.ordered {
+		return
+	}
+	// IDs may be generated independently before they are added here, so a
+	// higher ID can already be present. All but the last element are sorted,
+	// hence bubbling the new element towards the front restores the order.
+	for i := len(ids) - 1; i > 0 && ids[i] < ids[i-1]; i-- {
+		ids[i], ids[i-1] = ids[i-1], ids[i]
+	}
+}
+
+// ExpandPostings drains p and returns all of its values as a slice together
+// with the iterator's final error.
+func ExpandPostings(p Postings) (res []uint64, err error) {
+	for more := p.Next(); more; more = p.Next() {
+		res = append(res, p.At())
+	}
+	return res, p.Err()
+}
+
+// Postings provides iterative access over a postings list.
+// The implementations in this file expect the caller to call Next or Seek
+// before the first call to At.
+type Postings interface {
+	// Next advances the iterator and returns true if another value was found.
+	Next() bool
+
+	// Seek advances the iterator to value v or greater and returns
+	// true if a value was found.
+	Seek(v uint64) bool
+
+	// At returns the value at the current iterator position.
+	At() uint64
+
+	// Err returns the last error of the iterator.
+	Err() error
+}
+
+// errPostings is an iterator that never yields a value and reports the error
+// it was created with. It is a comparable value type, which is what allows
+// callers to compare a Postings against the EmptyPostings sentinel with ==.
+type errPostings struct {
+	err error
+}
+
+// Next always reports that there is no further value.
+func (e errPostings) Next() bool {
+	return false
+}
+
+// Seek always reports that no value was found.
+func (e errPostings) Seek(uint64) bool {
+	return false
+}
+
+// At always returns 0.
+func (e errPostings) At() uint64 {
+	return 0
+}
+
+// Err returns the stored error, which is nil for the empty sentinel.
+func (e errPostings) Err() error {
+	return e.err
+}
+
+// emptyPostings is the sentinel returned by EmptyPostings; its error is nil.
+var emptyPostings = errPostings{}
+
+// EmptyPostings returns a postings list that's always empty.
+// NOTE: Returning the EmptyPostings sentinel when an index.Postings has no
+// postings is recommended. It triggers optimized paths in functions such as
+// Intersect and Without, which compare their inputs against it.
+func EmptyPostings() Postings {
+	var sentinel Postings = emptyPostings
+	return sentinel
+}
+
+// ErrPostings returns a postings list that yields no values and reports err.
+func ErrPostings(err error) Postings {
+	return errPostings{err: err}
+}
+
+// Intersect returns a new postings list over the intersection of the input
+// postings. It returns the EmptyPostings sentinel when there are no inputs or
+// when any input is that sentinel, and the input itself when there is only one.
+func Intersect(its ...Postings) Postings {
+	switch len(its) {
+	case 0:
+		return EmptyPostings()
+	case 1:
+		return its[0]
+	}
+
+	for i := range its {
+		if its[i] == EmptyPostings() {
+			return EmptyPostings()
+		}
+	}
+	return newIntersectPostings(its...)
+}
+
+// intersectPostings iterates over the values common to all lists in arr.
+type intersectPostings struct {
+	arr []Postings
+	cur uint64
+}
+
+// newIntersectPostings wraps the given lists without advancing any of them.
+func newIntersectPostings(its ...Postings) *intersectPostings {
+	ip := new(intersectPostings)
+	ip.arr = its
+	return ip
+}
+
+// At returns the value the iterator is currently positioned on.
+func (it *intersectPostings) At() uint64 { return it.cur }
+
+// doNext advances all sub-iterators until they agree on a value that is
+// greater than or equal to it.cur. It returns false as soon as any
+// sub-iterator has no value at or after the current candidate.
+func (it *intersectPostings) doNext() bool {
+Loop:
+	for {
+		for _, p := range it.arr {
+			if !p.Seek(it.cur) {
+				return false
+			}
+			// This list skipped past the candidate: adopt its value as the new
+			// candidate and re-check every list from the start.
+			if p.At() > it.cur {
+				it.cur = p.At()
+				continue Loop
+			}
+		}
+		// Every list is positioned on it.cur.
+		return true
+	}
+}
+
+// Next steps every sub-iterator forward once, takes the largest value seen as
+// the new candidate and then lets doNext settle on the next common value.
+func (it *intersectPostings) Next() bool {
+	for i := range it.arr {
+		sub := it.arr[i]
+		if !sub.Next() {
+			return false
+		}
+		if it.cur < sub.At() {
+			it.cur = sub.At()
+		}
+	}
+	return it.doNext()
+}
+
+// Seek positions the iterator on the first common value that is greater than
+// or equal to id and reports whether such a value exists.
+func (it *intersectPostings) Seek(id uint64) bool {
+	it.cur = id
+	found := it.doNext()
+	return found
+}
+
+// Err returns the error of the first sub-iterator that has one, or nil.
+func (it *intersectPostings) Err() error {
+	for i := range it.arr {
+		if it.arr[i].Err() != nil {
+			return it.arr[i].Err()
+		}
+	}
+	return nil
+}
+
+// Merge returns a new iterator over the union of the input iterators. It
+// returns the EmptyPostings sentinel when there are no inputs or when two or
+// more inputs all turn out to be empty, and the input itself when there is
+// only one.
+func Merge(its ...Postings) Postings {
+	switch len(its) {
+	case 0:
+		return EmptyPostings()
+	case 1:
+		return its[0]
+	}
+
+	if merged, nonEmpty := newMergedPostings(its); nonEmpty {
+		return merged
+	}
+	return EmptyPostings()
+}
+
+// postingsHeap implements heap.Interface over postings lists, ordered by the
+// value each list is currently positioned on.
+type postingsHeap []Postings
+
+// Len returns the number of lists in the heap.
+func (h postingsHeap) Len() int {
+	return len(h)
+}
+
+// Less orders lists by their current value.
+func (h postingsHeap) Less(i, j int) bool {
+	return h[i].At() < h[j].At()
+}
+
+// Swap exchanges the lists at positions i and j.
+func (h *postingsHeap) Swap(i, j int) {
+	s := *h
+	s[i], s[j] = s[j], s[i]
+}
+
+// Push appends x, which must be a Postings, to the heap's backing slice.
+func (h *postingsHeap) Push(x interface{}) {
+	*h = append(*h, x.(Postings))
+}
+
+// Pop removes and returns the last element of the backing slice.
+func (h *postingsHeap) Pop() interface{} {
+	last := len(*h) - 1
+	top := (*h)[last]
+	*h = (*h)[:last]
+	return top
+}
+
+// mergedPostings iterates over the union of several postings lists. The lists
+// are kept in a min-heap keyed on the value each one is positioned on, and
+// values equal to the one just returned are skipped.
+type mergedPostings struct {
+	h postingsHeap
+	// initialized records whether the heap has been built, which happens on
+	// the first call to Next (directly or via Seek).
+	initialized bool
+	cur         uint64
+	err         error
+}
+
+// newMergedPostings advances every input once and collects those that yielded
+// a value. It returns nonEmpty == false when no input had a value. If an input
+// fails, the returned iterator carries that error and yields nothing.
+func newMergedPostings(p []Postings) (m *mergedPostings, nonEmpty bool) {
+	ph := make(postingsHeap, 0, len(p))
+
+	for _, it := range p {
+		// NOTE: mergedPostings struct requires the user to issue an initial Next.
+		if it.Next() {
+			ph = append(ph, it)
+			continue
+		}
+		if err := it.Err(); err != nil {
+			return &mergedPostings{err: err}, true
+		}
+	}
+
+	if len(ph) == 0 {
+		return nil, false
+	}
+	return &mergedPostings{h: ph}, true
+}
+
+// Next advances to the next distinct value of the union and reports whether
+// one exists.
+func (it *mergedPostings) Next() bool {
+	if it.h.Len() == 0 || it.err != nil {
+		return false
+	}
+
+	// The user must issue an initial Next.
+	if !it.initialized {
+		heap.Init(&it.h)
+		it.cur = it.h[0].At()
+		it.initialized = true
+		return true
+	}
+
+	for {
+		cur := it.h[0]
+		if !cur.Next() {
+			// The smallest list is exhausted (or failed): drop it from the heap.
+			heap.Pop(&it.h)
+			if cur.Err() != nil {
+				it.err = cur.Err()
+				return false
+			}
+			if it.h.Len() == 0 {
+				return false
+			}
+		} else {
+			// Value of top of heap has changed, re-heapify.
+			heap.Fix(&it.h, 0)
+		}
+
+		// Skip values equal to the one returned last; they are duplicates
+		// contributed by another list.
+		if it.h[0].At() != it.cur {
+			it.cur = it.h[0].At()
+			return true
+		}
+	}
+}
+
+// Seek advances to the first value of the union that is greater than or equal
+// to id and reports whether one exists.
+func (it *mergedPostings) Seek(id uint64) bool {
+	if it.h.Len() == 0 || it.err != nil {
+		return false
+	}
+	if !it.initialized {
+		if !it.Next() {
+			return false
+		}
+	}
+	for it.cur < id {
+		cur := it.h[0]
+		if !cur.Seek(id) {
+			heap.Pop(&it.h)
+			if cur.Err() != nil {
+				it.err = cur.Err()
+				return false
+			}
+			if it.h.Len() == 0 {
+				return false
+			}
+		} else {
+			// Value of top of heap has changed, re-heapify.
+			heap.Fix(&it.h, 0)
+		}
+
+		it.cur = it.h[0].At()
+	}
+	return true
+}
+
+// At returns the value the iterator is currently positioned on.
+func (it *mergedPostings) At() uint64 {
+	return it.cur
+}
+
+// Err returns the error of the sub-iterator that failed, if any.
+func (it *mergedPostings) Err() error {
+	return it.err
+}
+
+// Without returns a new postings list that contains all elements from the
+// full list that are not in the drop list. When either argument is the
+// EmptyPostings sentinel the result is returned without wrapping.
+func Without(full, drop Postings) Postings {
+	switch {
+	case full == EmptyPostings():
+		return EmptyPostings()
+	case drop == EmptyPostings():
+		return full
+	}
+	return newRemovedPostings(full, drop)
+}
+
+// removedPostings iterates over the values of full that do not occur in
+// remove.
+type removedPostings struct {
+	full, remove Postings
+
+	cur uint64
+
+	// initialized is set once both lists have been advanced for the first
+	// time. fok and rok hold the result of the latest Next/Seek call on full
+	// and remove respectively.
+	initialized bool
+	fok, rok    bool
+}
+
+// newRemovedPostings wraps the two lists without advancing either of them.
+func newRemovedPostings(full, remove Postings) *removedPostings {
+	rp := new(removedPostings)
+	rp.full, rp.remove = full, remove
+	return rp
+}
+
+// At returns the value the iterator is currently positioned on.
+func (rp *removedPostings) At() uint64 { return rp.cur }
+
+// Next advances to the next value of full that is absent from remove and
+// reports whether one exists. Note that full is already moved one step past
+// the value that is being returned.
+func (rp *removedPostings) Next() bool {
+	if !rp.initialized {
+		rp.fok = rp.full.Next()
+		rp.rok = rp.remove.Next()
+		rp.initialized = true
+	}
+	for rp.fok {
+		if !rp.rok {
+			// Nothing left to remove: everything remaining in full is kept.
+			rp.cur = rp.full.At()
+			rp.fok = rp.full.Next()
+			return true
+		}
+
+		keep, drop := rp.full.At(), rp.remove.At()
+		switch {
+		case keep < drop:
+			rp.cur = keep
+			rp.fok = rp.full.Next()
+			return true
+		case drop < keep:
+			// Forward the remove postings to the right position.
+			rp.rok = rp.remove.Seek(keep)
+		default:
+			// Both lists are on the same value: skip the current posting.
+			rp.fok = rp.full.Next()
+		}
+	}
+	return false
+}
+
+// Seek advances to the first kept value that is greater than or equal to id.
+// It returns true right away when the current value already satisfies that.
+func (rp *removedPostings) Seek(id uint64) bool {
+	if id <= rp.cur {
+		return true
+	}
+
+	// Both lists are moved to id first; Next then resolves which value of
+	// full survives.
+	rp.fok, rp.rok = rp.full.Seek(id), rp.remove.Seek(id)
+	rp.initialized = true
+
+	return rp.Next()
+}
+
+// Err returns the error of the full list if it has one and the error of the
+// remove list otherwise.
+func (rp *removedPostings) Err() error {
+	if rp.full.Err() == nil {
+		return rp.remove.Err()
+	}
+	return rp.full.Err()
+}
+
+// ListPostings implements the Postings interface over a plain list.
+// Iterating consumes the list: it always holds the values that have not been
+// visited yet and cur holds the value visited last.
+type ListPostings struct {
+	list []uint64
+	cur  uint64
+}
+
+// NewListPostings returns a Postings over list. The slice is used as is and
+// not copied.
+func NewListPostings(list []uint64) Postings {
+	return &ListPostings{list: list}
+}
+
+// newListPostings is the variadic, concretely typed form of NewListPostings.
+func newListPostings(list ...uint64) *ListPostings {
+	lp := new(ListPostings)
+	lp.list = list
+	return lp
+}
+
+// At returns the value the iterator is currently positioned on.
+func (it *ListPostings) At() uint64 { return it.cur }
+
+// Next moves to the next value of the list. When the list is exhausted it
+// resets the current value to 0 and returns false.
+func (it *ListPostings) Next() bool {
+	if len(it.list) == 0 {
+		it.cur = 0
+		return false
+	}
+	it.cur, it.list = it.list[0], it.list[1:]
+	return true
+}
+
+// Seek moves to the first value that is greater than or equal to x and
+// reports whether one exists. It does not move when the current value already
+// satisfies that.
+func (it *ListPostings) Seek(x uint64) bool {
+	switch {
+	case it.cur >= x:
+		// If the current value satisfies, then return.
+		return true
+	case len(it.list) == 0:
+		return false
+	}
+
+	// Binary search over the values that have not been visited yet.
+	pos := sort.Search(len(it.list), func(k int) bool {
+		return it.list[k] >= x
+	})
+	if pos == len(it.list) {
+		it.list = nil
+		return false
+	}
+	it.cur = it.list[pos]
+	it.list = it.list[pos+1:]
+	return true
+}
+
+// Err always returns nil; iterating an in-memory list cannot fail.
+func (it *ListPostings) Err() error { return nil }
+
+// bigEndianPostings implements the Postings interface over a byte stream of
+// big endian numbers. Every posting occupies 4 bytes; list holds the bytes
+// that have not been visited yet and cur the value visited last.
+type bigEndianPostings struct {
+	list []byte
+	cur  uint32
+}
+
+// newBigEndianPostings returns an iterator over list, which is used as is
+// and not copied.
+func newBigEndianPostings(list []byte) *bigEndianPostings {
+	return &bigEndianPostings{list: list}
+}
+
+// At returns the value the iterator is currently positioned on.
+func (it *bigEndianPostings) At() uint64 {
+	return uint64(it.cur)
+}
+
+// Next decodes the next 4 bytes. It returns false, leaving the current value
+// untouched, when fewer than 4 bytes remain.
+func (it *bigEndianPostings) Next() bool {
+	if len(it.list) >= 4 {
+		it.cur = binary.BigEndian.Uint32(it.list)
+		it.list = it.list[4:]
+		return true
+	}
+	return false
+}
+
+// Seek moves to the first value that is greater than or equal to x and
+// reports whether one exists. It does not move when the current value already
+// satisfies that. On failure the remaining bytes are dropped and the current
+// value is left untouched.
+func (it *bigEndianPostings) Seek(x uint64) bool {
+	if uint64(it.cur) >= x {
+		return true
+	}
+
+	num := len(it.list) / 4
+	// Do binary search between current position and end.
+	// The comparison is done in uint64: converting x to uint32 would wrap
+	// targets above MaxUint32 and make Seek land on a value smaller than x.
+	i := sort.Search(num, func(i int) bool {
+		return uint64(binary.BigEndian.Uint32(it.list[i*4:])) >= x
+	})
+	if i < num {
+		j := i * 4
+		it.cur = binary.BigEndian.Uint32(it.list[j:])
+		it.list = it.list[j+4:]
+		return true
+	}
+	it.list = nil
+	return false
+}
+
+// Err always returns nil.
+func (it *bigEndianPostings) Err() error {
+	return nil
+}
--- a/tsdb/index/postings_test.go
+++ b/tsdb/index/postings_test.go
@ -0,0 +1,814 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package index
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math/rand"
+	"sort"
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestMemPostings_addFor checks that addFor on ordered postings inserts an
+// out-of-order ID at its sorted position.
+func TestMemPostings_addFor(t *testing.T) {
+	p := NewMemPostings()
+	p.m[allPostingsKey.Name] = map[string][]uint64{
+		allPostingsKey.Value: {1, 2, 3, 4, 6, 7, 8},
+	}
+
+	p.addFor(5, allPostingsKey)
+
+	got := p.m[allPostingsKey.Name][allPostingsKey.Value]
+	testutil.Equals(t, []uint64{1, 2, 3, 4, 5, 6, 7, 8}, got)
+}
+
+// TestMemPostings_ensureOrder fills unordered postings with random lists and
+// checks that every list is sorted after EnsureOrder.
+func TestMemPostings_ensureOrder(t *testing.T) {
+	p := NewUnorderedMemPostings()
+	lists := map[string][]uint64{}
+	p.m["a"] = lists
+
+	for i := 0; i < 100; i++ {
+		ids := make([]uint64, 100)
+		for j := range ids {
+			ids[j] = rand.Uint64()
+		}
+		lists[fmt.Sprintf("%d", i)] = ids
+	}
+
+	p.EnsureOrder()
+
+	for _, byValue := range p.m {
+		for _, ids := range byValue {
+			sorted := sort.SliceIsSorted(ids, func(a, b int) bool {
+				return ids[a] < ids[b]
+			})
+			if !sorted {
+				t.Fatalf("postings list %v is not sorted", ids)
+			}
+		}
+	}
+}
+
+// TestIntersect checks Intersect against a table of inputs. Cases whose
+// expected result is the EmptyPostings sentinel additionally require that
+// Intersect returns exactly that sentinel; all other cases must not return it.
+func TestIntersect(t *testing.T) {
+	// a and b are shared by several cases. Those cases all contain
+	// EmptyPostings, so Intersect returns before iterating either of them.
+	a := newListPostings(1, 2, 3)
+	b := newListPostings(2, 3, 4)
+
+	var cases = []struct {
+		in []Postings
+
+		res Postings
+	}{
+		{
+			in:  []Postings{},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{a, b, EmptyPostings()},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{b, a, EmptyPostings()},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{EmptyPostings(), b, a},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{EmptyPostings(), a, b},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{a, EmptyPostings(), b},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{b, EmptyPostings(), a},
+			res: EmptyPostings(),
+		},
+		{
+			in:  []Postings{b, EmptyPostings(), a, a, b, a, a, a},
+			res: EmptyPostings(),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 5),
+				newListPostings(6, 7, 8, 9, 10),
+			},
+			res: newListPostings(),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 5),
+				newListPostings(4, 5, 6, 7, 8),
+			},
+			res: newListPostings(4, 5),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 9, 10),
+				newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
+			},
+			res: newListPostings(1, 4, 10),
+		},
+		{
+			in: []Postings{
+				newListPostings(1),
+				newListPostings(0, 1),
+			},
+			res: newListPostings(1),
+		},
+		{
+			in: []Postings{
+				newListPostings(1),
+			},
+			res: newListPostings(1),
+		},
+		{
+			in: []Postings{
+				newListPostings(1),
+				newListPostings(),
+			},
+			res: newListPostings(),
+		},
+		{
+			in: []Postings{
+				newListPostings(),
+				newListPostings(),
+			},
+			res: newListPostings(),
+		},
+	}
+
+	for _, c := range cases {
+		t.Run("", func(t *testing.T) {
+			if c.res == nil {
+				t.Fatal("intersect result expectancy cannot be nil")
+			}
+
+			// Expanding consumes c.res, so the expectation is materialised first.
+			expected, err := ExpandPostings(c.res)
+			testutil.Ok(t, err)
+
+			i := Intersect(c.in...)
+
+			if c.res == EmptyPostings() {
+				testutil.Equals(t, EmptyPostings(), i)
+				return
+			}
+
+			if i == EmptyPostings() {
+				t.Fatal("intersect unexpected result: EmptyPostings sentinel")
+			}
+
+			res, err := ExpandPostings(i)
+			testutil.Ok(t, err)
+			testutil.Equals(t, expected, res)
+		})
+	}
+}
+
+// TestMultiIntersect checks Intersect over more than two lists.
+func TestMultiIntersect(t *testing.T) {
+	type testCase struct {
+		p   [][]uint64
+		res []uint64
+	}
+	cases := []testCase{
+		{
+			p: [][]uint64{
+				{1, 2, 3, 4, 5, 6, 1000, 1001},
+				{2, 4, 5, 6, 7, 8, 999, 1001},
+				{1, 2, 5, 6, 7, 8, 1001, 1200},
+			},
+			res: []uint64{2, 5, 6, 1001},
+		},
+		// One of the reproducible cases for:
+		// https://github.com/prometheus/prometheus/issues/2616
+		// The initialisation of intersectPostings was moving the iterator forward
+		// prematurely making us miss some postings.
+		{
+			p: [][]uint64{
+				{1, 2},
+				{1, 2},
+				{1, 2},
+				{2},
+			},
+			res: []uint64{2},
+		},
+	}
+
+	for _, tc := range cases {
+		ps := make([]Postings, len(tc.p))
+		for i, ids := range tc.p {
+			ps[i] = newListPostings(ids...)
+		}
+
+		got, err := ExpandPostings(Intersect(ps...))
+
+		testutil.Ok(t, err)
+		testutil.Equals(t, tc.res, got)
+	}
+}
+
+// BenchmarkIntersect measures expanding an intersection for three input
+// shapes: few long lists with little overlap, few long lists with large
+// overlap, and a very large number of short lists.
+// NOTE(review): the list iterators are built once outside the timed loop and
+// ListPostings is consumed by iteration, so iterations after the first appear
+// to run on exhausted lists — confirm this is intended.
+func BenchmarkIntersect(t *testing.B) {
+	t.Run("LongPostings1", func(bench *testing.B) {
+		var a, b, c, d []uint64
+
+		for i := 0; i < 10000000; i += 2 {
+			a = append(a, uint64(i))
+		}
+		for i := 5000000; i < 5000100; i += 4 {
+			b = append(b, uint64(i))
+		}
+		for i := 5090000; i < 5090600; i += 4 {
+			b = append(b, uint64(i))
+		}
+		for i := 4990000; i < 5100000; i++ {
+			c = append(c, uint64(i))
+		}
+		for i := 4000000; i < 6000000; i++ {
+			d = append(d, uint64(i))
+		}
+
+		i1 := newListPostings(a...)
+		i2 := newListPostings(b...)
+		i3 := newListPostings(c...)
+		i4 := newListPostings(d...)
+
+		// Exclude the setup above from the measurement.
+		bench.ResetTimer()
+		bench.ReportAllocs()
+		for i := 0; i < bench.N; i++ {
+			if _, err := ExpandPostings(Intersect(i1, i2, i3, i4)); err != nil {
+				bench.Fatal(err)
+			}
+		}
+	})
+
+	t.Run("LongPostings2", func(bench *testing.B) {
+		var a, b, c, d []uint64
+
+		for i := 0; i < 12500000; i++ {
+			a = append(a, uint64(i))
+		}
+		for i := 7500000; i < 12500000; i++ {
+			b = append(b, uint64(i))
+		}
+		for i := 9000000; i < 20000000; i++ {
+			c = append(c, uint64(i))
+		}
+		for i := 10000000; i < 12000000; i++ {
+			d = append(d, uint64(i))
+		}
+
+		i1 := newListPostings(a...)
+		i2 := newListPostings(b...)
+		i3 := newListPostings(c...)
+		i4 := newListPostings(d...)
+
+		bench.ResetTimer()
+		bench.ReportAllocs()
+		for i := 0; i < bench.N; i++ {
+			if _, err := ExpandPostings(Intersect(i1, i2, i3, i4)); err != nil {
+				bench.Fatal(err)
+			}
+		}
+	})
+
+	// Many matchers(k >> n).
+	t.Run("ManyPostings", func(bench *testing.B) {
+		var its []Postings
+
+		// 100000 matchers(k=100000).
+		for i := 0; i < 100000; i++ {
+			var temp []uint64
+			for j := 1; j < 100; j++ {
+				temp = append(temp, uint64(j))
+			}
+			its = append(its, newListPostings(temp...))
+		}
+
+		bench.ResetTimer()
+		bench.ReportAllocs()
+		for i := 0; i < bench.N; i++ {
+			if _, err := ExpandPostings(Intersect(its...)); err != nil {
+				bench.Fatal(err)
+			}
+		}
+	})
+}
+
+// TestMultiMerge checks that merging three overlapping lists yields their
+// sorted union without duplicates.
+func TestMultiMerge(t *testing.T) {
+	merged := Merge(
+		newListPostings(1, 2, 3, 4, 5, 6, 1000, 1001),
+		newListPostings(2, 4, 5, 6, 7, 8, 999, 1001),
+		newListPostings(1, 2, 5, 6, 7, 8, 1001, 1200),
+	)
+
+	got, err := ExpandPostings(merged)
+	testutil.Ok(t, err)
+
+	want := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 999, 1000, 1001, 1200}
+	testutil.Equals(t, want, got)
+}
+
+// TestMergedPostings checks Merge against a table of inputs. Cases whose
+// expected result is the EmptyPostings sentinel additionally require that
+// Merge returns exactly that sentinel; all other cases must not return it.
+func TestMergedPostings(t *testing.T) {
+	var cases = []struct {
+		in []Postings
+
+		res Postings
+	}{
+		{
+			in:  []Postings{},
+			res: EmptyPostings(),
+		},
+		{
+			in: []Postings{
+				newListPostings(),
+				newListPostings(),
+			},
+			res: EmptyPostings(),
+		},
+		// A single input is returned as is, so an empty list is not turned
+		// into the sentinel here.
+		{
+			in: []Postings{
+				newListPostings(),
+			},
+			res: newListPostings(),
+		},
+		{
+			in: []Postings{
+				EmptyPostings(),
+				EmptyPostings(),
+				EmptyPostings(),
+				EmptyPostings(),
+			},
+			res: EmptyPostings(),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 5),
+				newListPostings(6, 7, 8, 9, 10),
+			},
+			res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 5),
+				newListPostings(4, 5, 6, 7, 8),
+			},
+			res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 9, 10),
+				newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
+			},
+			res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2, 3, 4, 9, 10),
+				EmptyPostings(),
+				newListPostings(1, 4, 5, 6, 7, 8, 10, 11),
+			},
+			res: newListPostings(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2),
+				newListPostings(),
+			},
+			res: newListPostings(1, 2),
+		},
+		{
+			in: []Postings{
+				newListPostings(1, 2),
+				EmptyPostings(),
+			},
+			res: newListPostings(1, 2),
+		},
+	}
+
+	for _, c := range cases {
+		t.Run("", func(t *testing.T) {
+			if c.res == nil {
+				t.Fatal("merge result expectancy cannot be nil")
+			}
+
+			// Expanding consumes c.res, so the expectation is materialised first.
+			expected, err := ExpandPostings(c.res)
+			testutil.Ok(t, err)
+
+			m := Merge(c.in...)
+
+			if c.res == EmptyPostings() {
+				testutil.Equals(t, EmptyPostings(), m)
+				return
+			}
+
+			if m == EmptyPostings() {
+				t.Fatal("merge unexpected result: EmptyPostings sentinel")
+			}
+
+			res, err := ExpandPostings(m)
+			testutil.Ok(t, err)
+			testutil.Equals(t, expected, res)
+		})
+	}
+}
+
+// TestMergedPostingsSeek checks Seek on the merge of two lists: whether the
+// seek succeeds and, if so, which values are returned from the seek position
+// onwards.
+func TestMergedPostingsSeek(t *testing.T) {
+	var cases = []struct {
+		a, b []uint64
+
+		seek    uint64
+		success bool
+		res     []uint64
+	}{
+		{
+			a: []uint64{2, 3, 4, 5},
+			b: []uint64{6, 7, 8, 9, 10},
+
+			seek:    1,
+			success: true,
+			res:     []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 5},
+			b: []uint64{6, 7, 8, 9, 10},
+
+			seek:    2,
+			success: true,
+			res:     []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 5},
+			b: []uint64{4, 5, 6, 7, 8},
+
+			seek:    9,
+			success: false,
+			res:     nil,
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 9, 10},
+			b: []uint64{1, 4, 5, 6, 7, 8, 10, 11},
+
+			seek:    10,
+			success: true,
+			res:     []uint64{10, 11},
+		},
+	}
+
+	for _, c := range cases {
+		a := newListPostings(c.a...)
+		b := newListPostings(c.b...)
+
+		p := Merge(a, b)
+
+		testutil.Equals(t, c.success, p.Seek(c.seek))
+
+		// After Seek(), At() should be called.
+		if c.success {
+			// ExpandPostings starts with Next, so the value Seek landed on has
+			// to be captured first and prepended to the expanded rest.
+			start := p.At()
+			lst, err := ExpandPostings(p)
+			testutil.Ok(t, err)
+
+			lst = append([]uint64{start}, lst...)
+			testutil.Equals(t, c.res, lst)
+		}
+	}
+}
+
+// TestRemovedPostings checks that removedPostings yields exactly the values
+// of a that do not occur in b.
+func TestRemovedPostings(t *testing.T) {
+	var cases = []struct {
+		a, b []uint64
+		res  []uint64
+	}{
+		{
+			a:   nil,
+			b:   nil,
+			res: []uint64(nil),
+		},
+		{
+			a:   []uint64{1, 2, 3, 4},
+			b:   nil,
+			res: []uint64{1, 2, 3, 4},
+		},
+		{
+			a:   nil,
+			b:   []uint64{1, 2, 3, 4},
+			res: []uint64(nil),
+		},
+		{
+			a:   []uint64{1, 2, 3, 4, 5},
+			b:   []uint64{6, 7, 8, 9, 10},
+			res: []uint64{1, 2, 3, 4, 5},
+		},
+		{
+			a:   []uint64{1, 2, 3, 4, 5},
+			b:   []uint64{4, 5, 6, 7, 8},
+			res: []uint64{1, 2, 3},
+		},
+		{
+			a:   []uint64{1, 2, 3, 4, 9, 10},
+			b:   []uint64{1, 4, 5, 6, 7, 8, 10, 11},
+			res: []uint64{2, 3, 9},
+		},
+		{
+			a:   []uint64{1, 2, 3, 4, 9, 10},
+			b:   []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+			res: []uint64(nil),
+		},
+	}
+
+	for _, c := range cases {
+		a := newListPostings(c.a...)
+		b := newListPostings(c.b...)
+
+		// Empty results are expected as a nil slice because ExpandPostings
+		// never appends in that case.
+		res, err := ExpandPostings(newRemovedPostings(a, b))
+		testutil.Ok(t, err)
+		testutil.Equals(t, c.res, res)
+	}
+
+}
+
+// TestRemovedNextStackoverflow removes a very long list from an identical
+// one. Every posting has to be skipped, which must complete without yielding
+// a value or failing.
+func TestRemovedNextStackoverflow(t *testing.T) {
+	var full, remove []uint64
+	for i := uint64(0); i < 1e7; i++ {
+		full = append(full, i)
+		remove = append(remove, i)
+	}
+
+	rp := newRemovedPostings(newListPostings(full...), newListPostings(remove...))
+
+	yielded := false
+	for rp.Next() {
+		yielded = true
+	}
+
+	testutil.Ok(t, rp.Err())
+	testutil.Assert(t, !yielded, "")
+}
+
+// TestRemovedPostingsSeek checks Seek on removedPostings: whether the seek
+// succeeds and, if so, which values are returned from the seek position
+// onwards.
+func TestRemovedPostingsSeek(t *testing.T) {
+	var cases = []struct {
+		a, b []uint64
+
+		seek    uint64
+		success bool
+		res     []uint64
+	}{
+		{
+			a: []uint64{2, 3, 4, 5},
+			b: []uint64{6, 7, 8, 9, 10},
+
+			seek:    1,
+			success: true,
+			res:     []uint64{2, 3, 4, 5},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 5},
+			b: []uint64{6, 7, 8, 9, 10},
+
+			seek:    2,
+			success: true,
+			res:     []uint64{2, 3, 4, 5},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 5},
+			b: []uint64{4, 5, 6, 7, 8},
+
+			seek:    9,
+			success: false,
+			res:     nil,
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 9, 10},
+			b: []uint64{1, 4, 5, 6, 7, 8, 10, 11},
+
+			seek:    10,
+			success: false,
+			res:     nil,
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 9, 10},
+			b: []uint64{1, 4, 5, 6, 7, 8, 11},
+
+			seek:    4,
+			success: true,
+			res:     []uint64{9, 10},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 9, 10},
+			b: []uint64{1, 4, 5, 6, 7, 8, 11},
+
+			seek:    5,
+			success: true,
+			res:     []uint64{9, 10},
+		},
+		{
+			a: []uint64{1, 2, 3, 4, 9, 10},
+			b: []uint64{1, 4, 5, 6, 7, 8, 11},
+
+			seek:    10,
+			success: true,
+			res:     []uint64{10},
+		},
+	}
+
+	for _, c := range cases {
+		a := newListPostings(c.a...)
+		b := newListPostings(c.b...)
+
+		p := newRemovedPostings(a, b)
+
+		testutil.Equals(t, c.success, p.Seek(c.seek))
+
+		// After Seek(), At() should be called.
+		if c.success {
+			// ExpandPostings starts with Next, so the value Seek landed on has
+			// to be captured first and prepended to the expanded rest.
+			start := p.At()
+			lst, err := ExpandPostings(p)
+			testutil.Ok(t, err)
+
+			lst = append([]uint64{start}, lst...)
+			testutil.Equals(t, c.res, lst)
+		}
+	}
+}
+
+// TestBigEndian checks iteration and seeking of bigEndianPostings over a
+// randomly generated, strictly increasing list of 1000 values.
+func TestBigEndian(t *testing.T) {
+	num := 1000
+	// mock a list as postings
+	ls := make([]uint32, num)
+	ls[0] = 2
+	// Consecutive values differ by at least 2, so ls[i]-1 and ls[i]+1 below
+	// never coincide with another element.
+	for i := 1; i < num; i++ {
+		ls[i] = ls[i-1] + uint32(rand.Int31n(25)) + 2
+	}
+
+	// Encode every value as 4 big endian bytes.
+	beLst := make([]byte, num*4)
+	for i := 0; i < num; i++ {
+		b := beLst[i*4 : i*4+4]
+		binary.BigEndian.PutUint32(b, ls[i])
+	}
+
+	t.Run("Iteration", func(t *testing.T) {
+		bep := newBigEndianPostings(beLst)
+		for i := 0; i < num; i++ {
+			testutil.Assert(t, bep.Next() == true, "")
+			testutil.Equals(t, uint64(ls[i]), bep.At())
+		}
+
+		testutil.Assert(t, bep.Next() == false, "")
+		testutil.Assert(t, bep.Err() == nil, "")
+	})
+
+	t.Run("Seek", func(t *testing.T) {
+		// The rows are applied in order to a single iterator, so each row
+		// starts from the position the previous one left behind. Seeking to a
+		// value at or below the current one must not move the iterator.
+		table := []struct {
+			seek  uint32
+			val   uint32
+			found bool
+		}{
+			{
+				ls[0] - 1, ls[0], true,
+			},
+			{
+				ls[4], ls[4], true,
+			},
+			{
+				ls[500] - 1, ls[500], true,
+			},
+			{
+				ls[600] + 1, ls[601], true,
+			},
+			{
+				ls[600] + 1, ls[601], true,
+			},
+			{
+				ls[600] + 1, ls[601], true,
+			},
+			{
+				ls[0], ls[601], true,
+			},
+			{
+				ls[600], ls[601], true,
+			},
+			{
+				ls[999], ls[999], true,
+			},
+			// A failed seek leaves the current value untouched.
+			{
+				ls[999] + 10, ls[999], false,
+			},
+		}
+
+		bep := newBigEndianPostings(beLst)
+
+		for _, v := range table {
+			testutil.Equals(t, v.found, bep.Seek(uint64(v.seek)))
+			testutil.Equals(t, uint64(v.val), bep.At())
+			testutil.Assert(t, bep.Err() == nil, "")
+		}
+	})
+}
+
+// TestIntersectWithMerge intersects a plain list with a merged one.
+// It is one of the reproducible cases for:
+// https://github.com/prometheus/prometheus/issues/2616
+func TestIntersectWithMerge(t *testing.T) {
+	plain := newListPostings(21, 22, 23, 24, 25, 30)
+	merged := Merge(
+		newListPostings(10, 20, 30),
+		newListPostings(15, 26, 30),
+	)
+
+	got, err := ExpandPostings(Intersect(plain, merged))
+
+	testutil.Ok(t, err)
+	testutil.Equals(t, []uint64{30}, got)
+}
+
+// TestWithoutPostings checks Without against a table of inputs. Cases whose
+// expected result is the EmptyPostings sentinel additionally require that
+// Without returns exactly that sentinel; all other cases must not return it.
+func TestWithoutPostings(t *testing.T) {
+	var cases = []struct {
+		base Postings
+		drop Postings
+
+		res Postings
+	}{
+		{
+			base: EmptyPostings(),
+			drop: EmptyPostings(),
+
+			res: EmptyPostings(),
+		},
+		{
+			base: EmptyPostings(),
+			drop: newListPostings(1, 2),
+
+			res: EmptyPostings(),
+		},
+		{
+			base: newListPostings(1, 2),
+			drop: EmptyPostings(),
+
+			res: newListPostings(1, 2),
+		},
+		// An empty list that is not the sentinel does not short-circuit.
+		{
+			base: newListPostings(),
+			drop: newListPostings(),
+
+			res: newListPostings(),
+		},
+		{
+			base: newListPostings(1, 2, 3),
+			drop: newListPostings(),
+
+			res: newListPostings(1, 2, 3),
+		},
+		{
+			base: newListPostings(1, 2, 3),
+			drop: newListPostings(4, 5, 6),
+
+			res: newListPostings(1, 2, 3),
+		},
+		{
+			base: newListPostings(1, 2, 3),
+			drop: newListPostings(3, 4, 5),
+
+			res: newListPostings(1, 2),
+		},
+	}
+
+	for _, c := range cases {
+		t.Run("", func(t *testing.T) {
+			if c.res == nil {
+				t.Fatal("without result expectancy cannot be nil")
+			}
+
+			// Expanding consumes c.res, so the expectation is materialised first.
+			expected, err := ExpandPostings(c.res)
+			testutil.Ok(t, err)
+
+			w := Without(c.base, c.drop)
+
+			if c.res == EmptyPostings() {
+				testutil.Equals(t, EmptyPostings(), w)
+				return
+			}
+
+			if w == EmptyPostings() {
+				t.Fatal("without unexpected result: EmptyPostings sentinel")
+			}
+
+			res, err := ExpandPostings(w)
+			testutil.Ok(t, err)
+			testutil.Equals(t, expected, res)
+		})
+	}
+}
--- a/tsdb/labels/labels.go
+++ b/tsdb/labels/labels.go
@ -0,0 +1,233 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package labels
+
+import (
+	"bufio"
+	"bytes"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/cespare/xxhash"
+	"github.com/pkg/errors"
+)
+
+// sep is the byte appended after every label name and every label value when
+// Labels.Hash serializes a label set for hashing.
+const sep = '\xff'
+
+// Label is a key/value pair of strings.
+type Label struct {
+	Name, Value string
+}
+
+// Labels is a sorted set of labels. Order has to be guaranteed upon
+// instantiation; the constructors New, FromMap and FromStrings sort their
+// result by label name.
+type Labels []Label
+
+// Len implements sort.Interface.
+func (ls Labels) Len() int {
+	return len(ls)
+}
+
+// Swap implements sort.Interface.
+func (ls Labels) Swap(i, j int) {
+	ls[i], ls[j] = ls[j], ls[i]
+}
+
+// Less implements sort.Interface. Labels are ordered by name only.
+func (ls Labels) Less(i, j int) bool {
+	return ls[i].Name < ls[j].Name
+}
+
+// String renders the label set as {name="value",...}, quoting every value
+// with strconv.Quote.
+func (ls Labels) String() string {
+	var buf bytes.Buffer
+
+	buf.WriteByte('{')
+	for idx := range ls {
+		if idx != 0 {
+			buf.WriteByte(',')
+		}
+		buf.WriteString(ls[idx].Name)
+		buf.WriteByte('=')
+		buf.WriteString(strconv.Quote(ls[idx].Value))
+	}
+	buf.WriteByte('}')
+
+	return buf.String()
+}
+
+// Hash returns a hash value for the label set. Every name and every value is
+// written to a scratch buffer followed by sep, and the buffer is hashed with
+// xxhash.
+func (ls Labels) Hash() uint64 {
+	buf := make([]byte, 0, 1024)
+
+	for i := range ls {
+		buf = append(append(buf, ls[i].Name...), sep)
+		buf = append(append(buf, ls[i].Value...), sep)
+	}
+	return xxhash.Sum64(buf)
+}
+
+// Get looks up the label called name and returns its value.
+// An empty string is returned when no such label exists.
+func (ls Labels) Get(name string) string {
+	for i := range ls {
+		if ls[i].Name != name {
+			continue
+		}
+		return ls[i].Value
+	}
+	return ""
+}
+
+// Equals reports whether both label sets have the same length and hold
+// identical labels at every position.
+func (ls Labels) Equals(o Labels) bool {
+	if len(o) != len(ls) {
+		return false
+	}
+	for i := range ls {
+		if ls[i] != o[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// Map converts the label set into a freshly allocated name -> value map.
+func (ls Labels) Map() map[string]string {
+	res := make(map[string]string, len(ls))
+	for i := range ls {
+		res[ls[i].Name] = ls[i].Value
+	}
+	return res
+}
+
+// WithoutEmpty returns the label set with all labels that have an empty value
+// removed. When no label has an empty value the receiver itself is returned;
+// otherwise a new slice is allocated.
+func (ls Labels) WithoutEmpty() Labels {
+	hasEmpty := false
+	for i := range ls {
+		if ls[i].Value == "" {
+			hasEmpty = true
+			break
+		}
+	}
+	if !hasEmpty {
+		return ls
+	}
+
+	// At least one label is dropped, so len(ls)-1 is an upper bound.
+	kept := make(Labels, 0, len(ls)-1)
+	for _, l := range ls {
+		if l.Value != "" {
+			kept = append(kept, l)
+		}
+	}
+	return kept
+}
+
+// New returns a sorted Labels from the given labels.
+// The caller has to guarantee that all label names are unique.
+//
+// The input is copied before sorting, so the slice passed by the caller is
+// never reordered.
+func New(ls ...Label) Labels {
+	set := append(make(Labels, 0, len(ls)), ls...)
+	sort.Sort(set)
+
+	return set
+}
+
+// FromMap builds a sorted Labels from the entries of m.
+// Entries whose value is the empty string are left out.
+func FromMap(m map[string]string) Labels {
+	res := make(Labels, 0, len(m))
+	for name, value := range m {
+		if value == "" {
+			continue
+		}
+		res = append(res, Label{Name: name, Value: value})
+	}
+	sort.Sort(res)
+
+	return res
+}
+
+// FromStrings builds sorted labels from alternating name/value strings.
+// Pairs with an empty value are left out. It panics when an odd number of
+// strings is passed.
+func FromStrings(ss ...string) Labels {
+	if len(ss)%2 != 0 {
+		panic("invalid number of strings")
+	}
+
+	var res Labels
+	for i := 1; i < len(ss); i += 2 {
+		name, value := ss[i-1], ss[i]
+		if value == "" {
+			continue
+		}
+		res = append(res, Label{Name: name, Value: value})
+	}
+
+	sort.Sort(res)
+	return res
+}
+
+// Compare orders two label sets.
+// The result is 0 if a==b, <0 if a < b, and >0 if a > b.
+// Labels are compared position by position, first by name and then by value.
+func Compare(a, b Labels) int {
+	shared := len(a)
+	if len(b) < shared {
+		shared = len(b)
+	}
+
+	for i := range a[:shared] {
+		la, lb := a[i], b[i]
+		if d := strings.Compare(la.Name, lb.Name); d != 0 {
+			return d
+		}
+		if d := strings.Compare(la.Value, lb.Value); d != 0 {
+			return d
+		}
+	}
+	// The common prefix is identical, so the shorter set sorts first.
+	return len(a) - len(b)
+}
+
+// Slice is a sortable slice of label sets, ordered by Compare.
+type Slice []Labels
+
+// Len implements sort.Interface.
+func (s Slice) Len() int {
+	return len(s)
+}
+
+// Swap implements sort.Interface.
+func (s Slice) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+
+// Less implements sort.Interface using Compare.
+func (s Slice) Less(i, j int) bool {
+	return Compare(s[i], s[j]) < 0
+}
+
+// ReadLabels reads up to n label sets in a JSON formatted file fn. It is mostly useful
+// to load testing data.
+//
+// The file is read line by line. On every line the characters `"`, `{` and `}`
+// are stripped, the remainder is split on "," into chunks and every chunk is
+// split on its first ":" into a label name and value, so values that contain
+// a colon themselves (e.g. "host:port") are kept intact. Label sets whose hash
+// was already seen are skipped.
+//
+// An error is returned when the file cannot be opened or read, when a chunk
+// has no ":" separator, or when fewer than n distinct label sets were found.
+// In the last case the label sets read so far are returned alongside the error.
+func ReadLabels(fn string, n int) ([]Labels, error) {
+	f, err := os.Open(fn)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var (
+		scanner = bufio.NewScanner(f)
+		// The replacer does not depend on the line, so build it only once.
+		stripper = strings.NewReplacer("\"", "", "{", "", "}", "")
+
+		mets   []Labels
+		hashes = map[uint64]struct{}{}
+		i      = 0
+	)
+
+	for scanner.Scan() && i < n {
+		m := make(Labels, 0, 10)
+
+		s := stripper.Replace(scanner.Text())
+
+		for _, labelChunk := range strings.Split(s, ",") {
+			// Split on the first colon only, so that colons inside the value survive.
+			split := strings.SplitN(labelChunk, ":", 2)
+			if len(split) != 2 {
+				return nil, errors.Errorf("malformed label %q in %q", labelChunk, fn)
+			}
+			m = append(m, Label{Name: split[0], Value: split[1]})
+		}
+		// Order of the k/v labels matters, don't assume we'll always receive them already sorted.
+		sort.Sort(m)
+
+		h := m.Hash()
+		if _, ok := hashes[h]; ok {
+			continue
+		}
+		mets = append(mets, m)
+		hashes[h] = struct{}{}
+		i++
+	}
+	// Scan also returns false on read errors; do not mistake those for EOF.
+	if err := scanner.Err(); err != nil {
+		return nil, errors.Wrapf(err, "read labels from %q", fn)
+	}
+
+	if i != n {
+		return mets, errors.Errorf("requested %d metrics but found %d", n, i)
+	}
+	return mets, nil
+}
--- a/tsdb/labels/labels_test.go
+++ b/tsdb/labels/labels_test.go
@ -0,0 +1,199 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package labels
+
+import (
+	"fmt"
+	"math/rand"
+	"path/filepath"
+	"sort"
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+func TestCompareAndEquals(t *testing.T) {
+	cases := []struct {
+		a, b []Label
+		res  int
+	}{
+		{
+			a:   []Label{},
+			b:   []Label{},
+			res: 0,
+		},
+		{
+			a:   []Label{{"a", ""}},
+			b:   []Label{{"a", ""}, {"b", ""}},
+			res: -1,
+		},
+		{
+			a:   []Label{{"a", ""}},
+			b:   []Label{{"a", ""}},
+			res: 0,
+		},
+		{
+			a:   []Label{{"aa", ""}, {"aa", ""}},
+			b:   []Label{{"aa", ""}, {"ab", ""}},
+			res: -1,
+		},
+		{
+			a:   []Label{{"aa", ""}, {"abb", ""}},
+			b:   []Label{{"aa", ""}, {"ab", ""}},
+			res: 1,
+		},
+		{
+			a: []Label{
+				{"__name__", "go_gc_duration_seconds"},
+				{"job", "prometheus"},
+				{"quantile", "0.75"},
+			},
+			b: []Label{
+				{"__name__", "go_gc_duration_seconds"},
+				{"job", "prometheus"},
+				{"quantile", "1"},
+			},
+			res: -1,
+		},
+		{
+			a: []Label{
+				{"handler", "prometheus"},
+				{"instance", "localhost:9090"},
+			},
+			b: []Label{
+				{"handler", "query"},
+				{"instance", "localhost:9090"},
+			},
+			res: -1,
+		},
+	}
+	for _, c := range cases {
+		// Use constructor to ensure sortedness.
+		a, b := New(c.a...), New(c.b...)
+
+		testutil.Equals(t, c.res, Compare(a, b))
+		testutil.Equals(t, c.res == 0, a.Equals(b))
+	}
+}
+
+// BenchmarkSliceSort measures sort.Sort on Slice for several input sizes.
+// The label sets are loaded from the 20kseries.json test data, duplicated
+// until at least 20e6 entries exist and then shuffled.
+func BenchmarkSliceSort(b *testing.B) {
+	path := filepath.Join("..", "testdata", "20kseries.json")
+	lbls, err := ReadLabels(path, 20000)
+	testutil.Ok(b, err)
+
+	for len(lbls) < 20e6 {
+		lbls = append(lbls, lbls...)
+	}
+	for i := range lbls {
+		j := rand.Intn(i + 1)
+		lbls[i], lbls[j] = lbls[j], lbls[i]
+	}
+
+	sizes := []int{100, 5000, 50000, 300000, 900000, 5e6, 20e6}
+	for _, size := range sizes {
+		b.Run(fmt.Sprintf("%d", size), func(b *testing.B) {
+			b.ReportAllocs()
+
+			for run := 0; run < b.N; run++ {
+				// Copying the unsorted prefix is setup and must not be timed.
+				b.StopTimer()
+				unsorted := make(Slice, size)
+				copy(unsorted, Slice(lbls[:size]))
+				b.StartTimer()
+
+				sort.Sort(unsorted)
+			}
+		})
+	}
+}
+
+// BenchmarkLabelSetFromMap measures FromMap on a six-entry map.
+func BenchmarkLabelSetFromMap(b *testing.B) {
+	input := map[string]string{
+		"job":       "node",
+		"instance":  "123.123.1.211:9090",
+		"path":      "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":    "GET",
+		"namespace": "system",
+		"status":    "500",
+	}
+
+	b.ReportAllocs()
+
+	var built Labels
+	for run := 0; run < b.N; run++ {
+		built = FromMap(input)
+	}
+	_ = built
+}
+
+// BenchmarkMapFromLabels measures Labels.Map on a six-label set.
+func BenchmarkMapFromLabels(b *testing.B) {
+	ls := FromMap(map[string]string{
+		"job":       "node",
+		"instance":  "123.123.1.211:9090",
+		"path":      "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":    "GET",
+		"namespace": "system",
+		"status":    "500",
+	})
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for run := 0; run < b.N; run++ {
+		_ = ls.Map()
+	}
+}
+
+// BenchmarkLabelSetEquals measures Labels.Equals for a label set compared
+// against itself.
+func BenchmarkLabelSetEquals(b *testing.B) {
+	// The vast majority of comparisons will be against a matching label set.
+	ls := FromMap(map[string]string{
+		"job":       "node",
+		"instance":  "123.123.1.211:9090",
+		"path":      "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":    "GET",
+		"namespace": "system",
+		"status":    "500",
+	})
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	var equal bool
+	for run := 0; run < b.N; run++ {
+		equal = ls.Equals(ls)
+	}
+	_ = equal
+}
+
+// BenchmarkLabelSetHash measures Labels.Hash on a six-label set.
+func BenchmarkLabelSetHash(b *testing.B) {
+	m := map[string]string{
+		"job":       "node",
+		"instance":  "123.123.1.211:9090",
+		"path":      "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":    "GET",
+		"namespace": "system",
+		"status":    "500",
+	}
+	ls := FromMap(m)
+	var res uint64
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		res += ls.Hash()
+	}
+	// Consume the accumulated value the same way the sibling benchmarks do;
+	// printing it would interleave with the benchmark result lines.
+	_ = res
+}
--- a/tsdb/labels/selector.go
+++ b/tsdb/labels/selector.go
@ -0,0 +1,109 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package labels
+
+import (
+	"fmt"
+	"regexp"
+)
+
+// Selector is a list of matchers that a label set has to satisfy.
+type Selector []Matcher
+
+// Matches reports whether every matcher accepts the value that labels holds
+// for the matcher's label name. Labels.Get yields an empty string for absent
+// labels, so matchers see "" in that case.
+func (s Selector) Matches(labels Labels) bool {
+	for i := range s {
+		value := labels.Get(s[i].Name())
+		if !s[i].Matches(value) {
+			return false
+		}
+	}
+	return true
+}
+
+// Matcher specifies a constraint for the value of a label.
+// Implementations in this file are EqualMatcher, RegexpMatcher and NotMatcher.
+type Matcher interface {
+	// Name returns the label name the matcher should apply to.
+	Name() string
+	// Matches checks whether a value fulfills the constraints.
+	Matches(v string) bool
+	// String returns a human readable matcher.
+	String() string
+}
+
+// EqualMatcher accepts exactly one label value.
+type EqualMatcher struct {
+	name, value string
+}
+
+// Name implements Matcher interface.
+func (m EqualMatcher) Name() string {
+	return m.name
+}
+
+// Matches implements Matcher interface.
+func (m EqualMatcher) Matches(v string) bool {
+	return m.value == v
+}
+
+// String implements Matcher interface.
+func (m EqualMatcher) String() string {
+	return fmt.Sprintf("%s=%q", m.name, m.value)
+}
+
+// Value returns the matched value.
+func (m EqualMatcher) Value() string {
+	return m.value
+}
+
+// NewEqualMatcher returns a new matcher matching an exact label value.
+func NewEqualMatcher(name, value string) Matcher {
+	m := EqualMatcher{name: name, value: value}
+	return &m
+}
+
+// RegexpMatcher matches label values against a compiled regular expression.
+type RegexpMatcher struct {
+	name string
+	re   *regexp.Regexp
+}
+
+// Name implements Matcher interface.
+func (m RegexpMatcher) Name() string {
+	return m.name
+}
+
+// Matches implements Matcher interface. It reports whether the regular
+// expression matches v, as decided by regexp.Regexp.MatchString.
+func (m RegexpMatcher) Matches(v string) bool {
+	return m.re.MatchString(v)
+}
+
+// String implements Matcher interface.
+func (m RegexpMatcher) String() string {
+	return fmt.Sprintf("%s=~%q", m.name, m.re.String())
+}
+
+// Value returns the source text of the regular expression.
+func (m RegexpMatcher) Value() string {
+	return m.re.String()
+}
+
+// NewRegexpMatcher compiles pattern and returns a matcher that checks label
+// values against it. The compile error is returned when pattern is not a
+// valid regular expression.
+func NewRegexpMatcher(name, pattern string) (Matcher, error) {
+	compiled, err := regexp.Compile(pattern)
+	if err != nil {
+		return nil, err
+	}
+	m := &RegexpMatcher{name: name, re: compiled}
+	return m, nil
+}
+
+// NewMustRegexpMatcher is like NewRegexpMatcher but panics with the compile
+// error if the pattern is not a valid regular expression.
+func NewMustRegexpMatcher(name, pattern string) Matcher {
+	m, err := NewRegexpMatcher(name, pattern)
+	if err != nil {
+		panic(err)
+	}
+	return m
+}
+
+// NotMatcher wraps a Matcher and negates its matching result. Name is
+// inherited from the embedded matcher.
+type NotMatcher struct {
+	Matcher
+}
+
+// Matches implements Matcher interface by negating the wrapped matcher.
+func (m NotMatcher) Matches(v string) bool {
+	matched := m.Matcher.Matches(v)
+	return !matched
+}
+
+// String implements Matcher interface.
+func (m NotMatcher) String() string {
+	return fmt.Sprintf("not(%s)", m.Matcher.String())
+}
+
+// Not inverts the matcher's matching result.
+func Not(m Matcher) Matcher {
+	return &NotMatcher{Matcher: m}
+}
--- a/tsdb/mocks_test.go
+++ b/tsdb/mocks_test.go
@ -0,0 +1,78 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"github.com/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/tsdb/chunks"
+	"github.com/prometheus/tsdb/index"
+	"github.com/prometheus/tsdb/labels"
+)
+
+// mockIndexWriter is an in-memory index writer for tests. It records the
+// series passed to AddSeries; all other methods are no-ops.
+type mockIndexWriter struct {
+	series []seriesSamples
+}
+
+// AddSymbols is a no-op.
+func (mockIndexWriter) AddSymbols(sym map[string]struct{}) error {
+	return nil
+}
+
+// AddSeries decodes all samples of the given chunks and appends them, one
+// sample slice per chunk, to the recorded series with the same label set.
+// A new series entry is created when no recorded series has these labels.
+// ref is ignored.
+func (m *mockIndexWriter) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error {
+	pos := -1
+	for j := range m.series {
+		if labels.FromMap(m.series[j].lset).Equals(l) {
+			pos = j
+			break
+		}
+	}
+	if pos < 0 {
+		m.series = append(m.series, seriesSamples{lset: l.Map()})
+		pos = len(m.series) - 1
+	}
+
+	// The iterator is handed back to Iterator on every chunk so it can be reused.
+	var it chunkenc.Iterator
+	for _, meta := range chunks {
+		decoded := make([]sample, 0, meta.Chunk.NumSamples())
+
+		it = meta.Chunk.Iterator(it)
+		for it.Next() {
+			ts, val := it.At()
+			decoded = append(decoded, sample{t: ts, v: val})
+		}
+		if err := it.Err(); err != nil {
+			return err
+		}
+
+		m.series[pos].chunks = append(m.series[pos].chunks, decoded)
+	}
+	return nil
+}
+
+// WriteLabelIndex is a no-op.
+func (mockIndexWriter) WriteLabelIndex(names []string, values []string) error {
+	return nil
+}
+
+// WritePostings is a no-op.
+func (mockIndexWriter) WritePostings(name, value string, it index.Postings) error {
+	return nil
+}
+
+// Close is a no-op.
+func (mockIndexWriter) Close() error {
+	return nil
+}
+
+// mockBReader is a block reader for tests that hands out the configured
+// index and chunk readers together with a fixed time range.
+type mockBReader struct {
+	ir   IndexReader
+	cr   ChunkReader
+	mint int64
+	maxt int64
+}
+
+// Index returns the configured index reader.
+func (r *mockBReader) Index() (IndexReader, error) {
+	return r.ir, nil
+}
+
+// Chunks returns the configured chunk reader.
+func (r *mockBReader) Chunks() (ChunkReader, error) {
+	return r.cr, nil
+}
+
+// Tombstones returns a fresh in-memory tombstone reader on every call.
+func (r *mockBReader) Tombstones() (TombstoneReader, error) {
+	return newMemTombstones(), nil
+}
+
+// Meta returns a BlockMeta that only carries the configured time range.
+func (r *mockBReader) Meta() BlockMeta {
+	return BlockMeta{MinTime: r.mint, MaxTime: r.maxt}
+}
--- a/tsdb/querier.go
+++ b/tsdb/querier.go
--- a/tsdb/querier_test.go
+++ b/tsdb/querier_test.go
--- a/tsdb/record.go
+++ b/tsdb/record.go
@ -0,0 +1,208 @@
+// Copyright 2018 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"math"
+	"sort"
+
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/encoding"
+	"github.com/prometheus/tsdb/labels"
+)
+
+// RecordType represents the data type of a record.
+// RecordDecoder.Type reads it from the first byte of a record, and the
+// RecordEncoder methods write it as the first byte.
+type RecordType uint8
+
+const (
+	// RecordInvalid is returned for unrecognised WAL record types.
+	RecordInvalid RecordType = 255
+	// RecordSeries is used to match WAL records of type Series.
+	RecordSeries RecordType = 1
+	// RecordSamples is used to match WAL records of type Samples.
+	RecordSamples RecordType = 2
+	// RecordTombstones is used to match WAL records of type Tombstones.
+	RecordTombstones RecordType = 3
+)
+
+// RecordDecoder decodes series, sample, and tombstone records.
+// The zero value is ready to use.
+type RecordDecoder struct{}
+
+// Type inspects the first byte of rec and returns the record type it denotes.
+// RecordInvalid is returned for an empty record or an unknown type byte.
+func (d *RecordDecoder) Type(rec []byte) RecordType {
+	if len(rec) == 0 {
+		return RecordInvalid
+	}
+	t := RecordType(rec[0])
+	if t == RecordSeries || t == RecordSamples || t == RecordTombstones {
+		return t
+	}
+	return RecordInvalid
+}
+
+// Series decodes the series contained in rec and appends them to the given
+// slice. Each entry consists of a big-endian 64-bit reference, a uvarint
+// label count and that many name/value strings; the labels are sorted after
+// decoding. An error is returned when the record type byte is not
+// RecordSeries, when decoding fails, or when bytes are left over.
+func (d *RecordDecoder) Series(rec []byte, series []RefSeries) ([]RefSeries, error) {
+	buf := encoding.Decbuf{B: rec}
+
+	if typ := RecordType(buf.Byte()); typ != RecordSeries {
+		return nil, errors.New("invalid record type")
+	}
+	for len(buf.B) > 0 && buf.Err() == nil {
+		ref := buf.Be64()
+
+		lbls := make(labels.Labels, buf.Uvarint())
+		for i := range lbls {
+			name := buf.UvarintStr()
+			value := buf.UvarintStr()
+			lbls[i] = labels.Label{Name: name, Value: value}
+		}
+		sort.Sort(lbls)
+
+		series = append(series, RefSeries{Ref: ref, Labels: lbls})
+	}
+	if err := buf.Err(); err != nil {
+		return nil, err
+	}
+	if rest := len(buf.B); rest > 0 {
+		return nil, errors.Errorf("unexpected %d bytes left in entry", rest)
+	}
+	return series, nil
+}
+
+// Samples appends samples in rec to the given slice.
+func (d *RecordDecoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) {
+	dec := encoding.Decbuf{B: rec}
+
+	if RecordType(dec.Byte()) != RecordSamples {
+		return nil, errors.New("invalid record type")
+	}
+	if dec.Len() == 0 {
+		return samples, nil
+	}
+	var (
+		baseRef  = dec.Be64()
+		baseTime = dec.Be64int64()
+	)
+	for len(dec.B) > 0 && dec.Err() == nil {
+		dref := dec.Varint64()
+		dtime := dec.Varint64()
+		val := dec.Be64()
+
+		samples = append(samples, RefSample{
+			Ref: uint64(int64(baseRef) + dref),
+			T:   baseTime + dtime,
+			V:   math.Float64frombits(val),
+		})
+	}
+
+	if dec.Err() != nil {
+		return nil, errors.Wrapf(dec.Err(), "decode error after %d samples", len(samples))
+	}
+	if len(dec.B) > 0 {
+		return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
+	}
+	return samples, nil
+}
+
+// Tombstones decodes the tombstones contained in rec and appends them to the
+// given slice. Every decoded entry becomes its own Stone holding exactly one
+// interval.
+func (d *RecordDecoder) Tombstones(rec []byte, tstones []Stone) ([]Stone, error) {
+	buf := encoding.Decbuf{B: rec}
+
+	if typ := RecordType(buf.Byte()); typ != RecordTombstones {
+		return nil, errors.New("invalid record type")
+	}
+	for buf.Len() > 0 && buf.Err() == nil {
+		// Read the fields in their on-disk order: ref, mint, maxt.
+		ref := buf.Be64()
+		mint := buf.Varint64()
+		maxt := buf.Varint64()
+
+		tstones = append(tstones, Stone{
+			ref:       ref,
+			intervals: Intervals{{Mint: mint, Maxt: maxt}},
+		})
+	}
+	if err := buf.Err(); err != nil {
+		return nil, err
+	}
+	if rest := len(buf.B); rest > 0 {
+		return nil, errors.Errorf("unexpected %d bytes left in entry", rest)
+	}
+	return tstones, nil
+}
+
+// RecordEncoder encodes series, sample, and tombstones records.
+// The zero value is ready to use.
+type RecordEncoder struct{}
+
+// Series appends the encoded series to b and returns the resulting slice.
+// The output starts with the RecordSeries type byte; every series follows as
+// its reference, its label count and its name/value strings.
+func (e *RecordEncoder) Series(series []RefSeries, b []byte) []byte {
+	buf := encoding.Encbuf{B: b}
+	buf.PutByte(byte(RecordSeries))
+
+	for i := range series {
+		lset := series[i].Labels
+
+		buf.PutBE64(series[i].Ref)
+		buf.PutUvarint(len(lset))
+		for j := range lset {
+			buf.PutUvarintStr(lset[j].Name)
+			buf.PutUvarintStr(lset[j].Value)
+		}
+	}
+	return buf.Get()
+}
+
+// Samples appends the encoded samples to b and returns the resulting slice.
+// For an empty input only the RecordSamples type byte is written.
+func (e *RecordEncoder) Samples(samples []RefSample, b []byte) []byte {
+	buf := encoding.Encbuf{B: b}
+	buf.PutByte(byte(RecordSamples))
+
+	if len(samples) == 0 {
+		return buf.Get()
+	}
+
+	// The first sample provides the base reference and base timestamp.
+	// Every sample, including the first, is stored as a delta to those.
+	baseRef, baseTime := samples[0].Ref, samples[0].T
+
+	buf.PutBE64(baseRef)
+	buf.PutBE64int64(baseTime)
+
+	for i := range samples {
+		buf.PutVarint64(int64(samples[i].Ref) - int64(baseRef))
+		buf.PutVarint64(samples[i].T - baseTime)
+		buf.PutBE64(math.Float64bits(samples[i].V))
+	}
+	return buf.Get()
+}
+
+// Tombstones appends the encoded tombstones to b and returns the resulting
+// slice. Every interval is written as a separate entry that repeats the
+// reference of its tombstone.
+func (e *RecordEncoder) Tombstones(tstones []Stone, b []byte) []byte {
+	buf := encoding.Encbuf{B: b}
+	buf.PutByte(byte(RecordTombstones))
+
+	for i := range tstones {
+		ref := tstones[i].ref
+		for _, iv := range tstones[i].intervals {
+			buf.PutBE64(ref)
+			buf.PutVarint64(iv.Mint)
+			buf.PutVarint64(iv.Maxt)
+		}
+	}
+	return buf.Get()
+}
--- a/tsdb/record_test.go
+++ b/tsdb/record_test.go
@ -0,0 +1,118 @@
+// Copyright 2018 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"testing"
+
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/encoding"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestRecord_EncodeDecode round-trips series, samples and tombstones through
+// RecordEncoder and RecordDecoder.
+func TestRecord_EncodeDecode(t *testing.T) {
+	var (
+		enc RecordEncoder
+		dec RecordDecoder
+	)
+
+	series := []RefSeries{
+		{Ref: 100, Labels: labels.FromStrings("abc", "def", "123", "456")},
+		{Ref: 1, Labels: labels.FromStrings("abc", "def2", "1234", "4567")},
+		{Ref: 435245, Labels: labels.FromStrings("xyz", "def", "foo", "bar")},
+	}
+	encSeries := enc.Series(series, nil)
+	decSeries, err := dec.Series(encSeries, nil)
+	testutil.Ok(t, err)
+	testutil.Equals(t, series, decSeries)
+
+	samples := []RefSample{
+		{Ref: 0, T: 12423423, V: 1.2345},
+		{Ref: 123, T: -1231, V: -123},
+		{Ref: 2, T: 0, V: 99999},
+	}
+	encSamples := enc.Samples(samples, nil)
+	decSamples, err := dec.Samples(encSamples, nil)
+	testutil.Ok(t, err)
+	testutil.Equals(t, samples, decSamples)
+
+	// Intervals get split up into single entries. So we don't get back exactly
+	// what we put in.
+	tstones := []Stone{
+		{ref: 123, intervals: Intervals{
+			{Mint: -1000, Maxt: 1231231},
+			{Mint: 5000, Maxt: 0},
+		}},
+		{ref: 13, intervals: Intervals{
+			{Mint: -1000, Maxt: -11},
+			{Mint: 5000, Maxt: 1000},
+		}},
+	}
+	wantTstones := []Stone{
+		{ref: 123, intervals: Intervals{{Mint: -1000, Maxt: 1231231}}},
+		{ref: 123, intervals: Intervals{{Mint: 5000, Maxt: 0}}},
+		{ref: 13, intervals: Intervals{{Mint: -1000, Maxt: -11}}},
+		{ref: 13, intervals: Intervals{{Mint: 5000, Maxt: 1000}}},
+	}
+	encTstones := enc.Tombstones(tstones, nil)
+	decTstones, err := dec.Tombstones(encTstones, nil)
+	testutil.Ok(t, err)
+	testutil.Equals(t, wantTstones, decTstones)
+}
+
+// TestRecord_Corruputed ensures that corrupted records return the correct error.
+// Bugfix check for pull/521 and pull/523.
+//
+// Every sub-test encodes a valid record, truncates it to its first 8 bytes
+// and expects the decoder to report encoding.ErrInvalidSize.
+func TestRecord_Corruputed(t *testing.T) {
+	var enc RecordEncoder
+	var dec RecordDecoder
+
+	t.Run("Test corrupted series record", func(t *testing.T) {
+		series := []RefSeries{
+			{
+				Ref:    100,
+				Labels: labels.FromStrings("abc", "def", "123", "456"),
+			},
+		}
+
+		corrupted := enc.Series(series, nil)[:8]
+		_, err := dec.Series(corrupted, nil)
+		// Expected value first, matching the other testutil.Equals calls.
+		testutil.Equals(t, encoding.ErrInvalidSize, err)
+	})
+
+	t.Run("Test corrupted sample record", func(t *testing.T) {
+		samples := []RefSample{
+			{Ref: 0, T: 12423423, V: 1.2345},
+		}
+
+		corrupted := enc.Samples(samples, nil)[:8]
+		_, err := dec.Samples(corrupted, nil)
+		// The samples decoder wraps the error, so compare against its cause.
+		testutil.Equals(t, encoding.ErrInvalidSize, errors.Cause(err))
+	})
+
+	t.Run("Test corrupted tombstone record", func(t *testing.T) {
+		tstones := []Stone{
+			{ref: 123, intervals: Intervals{
+				{Mint: -1000, Maxt: 1231231},
+				{Mint: 5000, Maxt: 0},
+			}},
+		}
+
+		corrupted := enc.Tombstones(tstones, nil)[:8]
+		_, err := dec.Tombstones(corrupted, nil)
+		testutil.Equals(t, encoding.ErrInvalidSize, err)
+	})
+}
--- a/tsdb/repair.go
+++ b/tsdb/repair.go
@ -0,0 +1,133 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"encoding/json"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
+	"github.com/pkg/errors"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/fileutil"
+)
+
+// repairBadIndexVersion repairs an issue in index and meta.json persistence introduced in
+// commit 129773b41a565fde5156301e37f9a87158030443.
+//
+// For every block directory below dir whose meta.json does not report
+// version 1, the index file is copied to "index.repaired", the 5th byte of
+// the copy is set to 2, the copy replaces the original index file and
+// meta.json is rewritten with version 1. Blocks that already report version 1
+// are only logged. The first failure aborts the repair; the returned error is
+// wrapped with the affected block directory. Temporary files that are still
+// present are removed before returning.
+func repairBadIndexVersion(logger log.Logger, dir string) error {
+	// All blocks written by Prometheus 2.1 with a meta.json version of 2 are affected.
+	// We must actually set the index file version to 2 and revert the meta.json version back to 1.
+	dirs, err := blockDirs(dir)
+	if err != nil {
+		return errors.Wrapf(err, "list block dirs in %q", dir)
+	}
+
+	wrapErr := func(err error, d string) error {
+		return errors.Wrapf(err, "block dir: %q", d)
+	}
+
+	// closeAndFail closes the given files and returns cause together with any
+	// close error, so that no file handle is leaked on an early return.
+	closeAndFail := func(cause error, d string, files ...*os.File) error {
+		var merr tsdb_errors.MultiError
+		merr.Add(wrapErr(cause, d))
+		for _, f := range files {
+			merr.Add(wrapErr(f.Close(), d))
+		}
+		return merr.Err()
+	}
+
+	// At most one temporary file is created per block directory.
+	tmpFiles := make([]string, 0, len(dirs))
+	defer func() {
+		for _, tmp := range tmpFiles {
+			if err := os.RemoveAll(tmp); err != nil {
+				level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
+			}
+		}
+	}()
+
+	for _, d := range dirs {
+		meta, err := readBogusMetaFile(d)
+		if err != nil {
+			return wrapErr(err, d)
+		}
+		if meta.Version == 1 {
+			level.Info(logger).Log(
+				"msg", "found healthy block",
+				"mint", meta.MinTime,
+				"maxt", meta.MaxTime,
+				"ulid", meta.ULID,
+			)
+			continue
+		}
+		level.Info(logger).Log(
+			"msg", "fixing broken block",
+			"mint", meta.MinTime,
+			"maxt", meta.MaxTime,
+			"ulid", meta.ULID,
+		)
+
+		repl, err := os.Create(filepath.Join(d, "index.repaired"))
+		if err != nil {
+			return wrapErr(err, d)
+		}
+		tmpFiles = append(tmpFiles, repl.Name())
+
+		broken, err := os.Open(filepath.Join(d, indexFilename))
+		if err != nil {
+			return closeAndFail(err, d, repl)
+		}
+		if _, err := io.Copy(repl, broken); err != nil {
+			return closeAndFail(err, d, repl, broken)
+		}
+
+		// Set the 5th byte to 2 to indicate the correct file format version.
+		if _, err := repl.WriteAt([]byte{2}, 4); err != nil {
+			return closeAndFail(err, d, repl, broken)
+		}
+		if err := repl.Sync(); err != nil {
+			return closeAndFail(err, d, repl, broken)
+		}
+		if err := repl.Close(); err != nil {
+			return closeAndFail(err, d, broken)
+		}
+		if err := broken.Close(); err != nil {
+			return wrapErr(err, d)
+		}
+		if err := fileutil.Replace(repl.Name(), broken.Name()); err != nil {
+			return wrapErr(err, d)
+		}
+		// Reset version of meta.json to 1.
+		meta.Version = 1
+		if _, err := writeMetaFile(logger, d, meta); err != nil {
+			return wrapErr(err, d)
+		}
+	}
+	return nil
+}
+
+func readBogusMetaFile(dir string) (*BlockMeta, error) {
+	b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename))
+	if err != nil {
+		return nil, err
+	}
+	var m BlockMeta
+
+	if err := json.Unmarshal(b, &m); err != nil {
+		return nil, err
+	}
+	if m.Version != 1 && m.Version != 2 {
+		return nil, errors.Errorf("unexpected meta file version %d", m.Version)
+	}
+	return &m, nil
+}
--- a/tsdb/repair_test.go
+++ b/tsdb/repair_test.go
@ -0,0 +1,127 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/prometheus/tsdb/chunks"
+	"github.com/prometheus/tsdb/fileutil"
+	"github.com/prometheus/tsdb/index"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestRepairBadIndexVersion opens a DB on a copy of a block with a broken
+// index version and verifies that afterwards the series can be read from the
+// index and that the meta file reports version 1.
+func TestRepairBadIndexVersion(t *testing.T) {
+	// The broken index used in this test was written by the following script
+	// at a broken revision.
+	//
+	// func main() {
+	// 	w, err := index.NewWriter(indexFilename)
+	// 	if err != nil {
+	// 		panic(err)
+	// 	}
+	// 	err = w.AddSymbols(map[string]struct{}{
+	// 		"a": struct{}{},
+	// 		"b": struct{}{},
+	// 		"1": struct{}{},
+	// 		"2": struct{}{},
+	// 	})
+	// 	if err != nil {
+	// 		panic(err)
+	// 	}
+	// 	err = w.AddSeries(1, labels.FromStrings("a", "1", "b", "1"))
+	// 	if err != nil {
+	// 		panic(err)
+	// 	}
+	// 	err = w.AddSeries(2, labels.FromStrings("a", "2", "b", "1"))
+	// 	if err != nil {
+	// 		panic(err)
+	// 	}
+	// 	err = w.WritePostings("b", "1", index.NewListPostings([]uint64{1, 2}))
+	// 	if err != nil {
+	// 		panic(err)
+	// 	}
+	// 	if err := w.Close(); err != nil {
+	// 		panic(err)
+	// 	}
+	// }
+	dbDir := filepath.Join("testdata", "repair_index_version", "01BZJ9WJQPWHGNC2W4J9TA62KC")
+	tmpDir := filepath.Join("testdata", "repair_index_version", "copy")
+	tmpDbDir := filepath.Join(tmpDir, "3MCNSQ8S31EHGJYWK5E1GPJWJZ")
+
+	// Check the current db.
+	// In its current state, lookups should fail with the fixed code.
+	_, _, err := readMetaFile(dbDir)
+	testutil.NotOk(t, err)
+
+	// Touch chunks dir in block.
+	testutil.Ok(t, os.MkdirAll(filepath.Join(dbDir, "chunks"), 0777))
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(filepath.Join(dbDir, "chunks")))
+	}()
+
+	r, err := index.NewFileReader(filepath.Join(dbDir, indexFilename))
+	testutil.Ok(t, err)
+	p, err := r.Postings("b", "1")
+	testutil.Ok(t, err)
+	for p.Next() {
+		t.Logf("next ID %d", p.At())
+
+		var lset labels.Labels
+		testutil.NotOk(t, r.Series(p.At(), &lset, nil))
+	}
+	testutil.Ok(t, p.Err())
+	testutil.Ok(t, r.Close())
+
+	// Register the cleanup before copying so that a partially written copy is
+	// removed as well when the copy fails half-way.
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpDir))
+	}()
+	// Create a copy DB to run test against.
+	if err = fileutil.CopyDirs(dbDir, tmpDbDir); err != nil {
+		t.Fatal(err)
+	}
+	// On DB opening all blocks in the base dir should be repaired.
+	db, err := Open(tmpDir, nil, nil, nil)
+	testutil.Ok(t, err)
+	// A failed close must not go unnoticed.
+	testutil.Ok(t, db.Close())
+
+	r, err = index.NewFileReader(filepath.Join(tmpDbDir, indexFilename))
+	testutil.Ok(t, err)
+	defer r.Close()
+	p, err = r.Postings("b", "1")
+	testutil.Ok(t, err)
+	res := []labels.Labels{}
+
+	for p.Next() {
+		t.Logf("next ID %d", p.At())
+
+		var lset labels.Labels
+		var chks []chunks.Meta
+		testutil.Ok(t, r.Series(p.At(), &lset, &chks))
+		res = append(res, lset)
+	}
+
+	testutil.Ok(t, p.Err())
+	testutil.Equals(t, []labels.Labels{
+		{{"a", "1"}, {"b", "1"}},
+		{{"a", "2"}, {"b", "1"}},
+	}, res)
+
+	meta, _, err := readMetaFile(tmpDbDir)
+	testutil.Ok(t, err)
+	testutil.Assert(t, meta.Version == 1, "unexpected meta version %d", meta.Version)
+}
--- a/tsdb/test/conv_test.go
+++ b/tsdb/test/conv_test.go
@ -0,0 +1,58 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package test
+
+import "testing"
+
+// BenchmarkMapConversion measures converting a map with named string key and
+// value types into a plain map[string]string, entry by entry.
+func BenchmarkMapConversion(b *testing.B) {
+	type (
+		key string
+		val string
+	)
+
+	src := map[key]val{
+		"job":       "node",
+		"instance":  "123.123.1.211:9090",
+		"path":      "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":    "GET",
+		"namespace": "system",
+		"status":    "500",
+	}
+
+	// Declared outside the loop; each iteration replaces it with a fresh map.
+	var dst map[string]string
+
+	for n := 0; n < b.N; n++ {
+		dst = make(map[string]string, len(src))
+		for name, value := range src {
+			dst[string(name)] = string(value)
+		}
+	}
+}
+
+// BenchmarkListIter measures summing a 10000-element []uint32 with a range
+// loop.
+func BenchmarkListIter(b *testing.B) {
+	const size = 1e4
+
+	var values []uint32
+	for n := 0; n < size; n++ {
+		values = append(values, uint32(n))
+	}
+
+	// Exclude the slice construction above from the measurement.
+	b.ResetTimer()
+
+	var grandTotal uint32
+
+	for run := 0; run < b.N; run++ {
+		var partial uint32
+		for _, v := range values {
+			partial += v
+		}
+		grandTotal += partial
+	}
+}
--- a/tsdb/test/hash_test.go
+++ b/tsdb/test/hash_test.go
@ -0,0 +1,124 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package test
+
+import (
+	"crypto/rand"
+	"fmt"
+	"hash/crc32"
+	"testing"
+
+	"github.com/cespare/xxhash"
+	sip13 "github.com/dgryski/go-sip13"
+)
+
+// pair is a single name/value string pair used as benchmark input.
+type pair struct {
+	name, value string
+}
+
+// testInput is the sample set of name/value pairs that BenchmarkHash
+// concatenates into the byte slice it hashes.
+var testInput = []pair{
+	{"job", "node"},
+	{"instance", "123.123.1.211:9090"},
+	{"path", "/api/v1/namespaces/<namespace>/deployments/<name>"},
+	{"method", "GET"},
+	{"namespace", "system"},
+	{"status", "500"},
+}
+
+// BenchmarkHash compares the throughput of several 64-bit hash functions on
+// one input: every name and value of testInput, each followed by a 0xff
+// separator byte.
+func BenchmarkHash(b *testing.B) {
+	var input []byte
+	for _, v := range testInput {
+		input = append(input, v.name...)
+		input = append(input, '\xff')
+		input = append(input, v.value...)
+		input = append(input, '\xff')
+	}
+
+	// total accumulates the hash results so that the calls have a visible
+	// effect.
+	var total uint64
+
+	// Fixed keys for the keyed sip13 hash.
+	var k0 uint64 = 0x0706050403020100
+	var k1 uint64 = 0x0f0e0d0c0b0a0908
+
+	// A slice instead of a map literal: map iteration order is randomized, which
+	// would shuffle the order of the sub-benchmarks from run to run.
+	hashes := []struct {
+		name string
+		sum  func(b []byte) uint64
+	}{
+		{"xxhash", xxhash.Sum64},
+		{"fnv64", fnv64a},
+		{"sip13", func(b []byte) uint64 { return sip13.Sum64(k0, k1, b) }},
+	}
+
+	for _, h := range hashes {
+		sum := h.sum
+		b.Run(h.name, func(b *testing.B) {
+			b.SetBytes(int64(len(input)))
+			total = 0
+			for i := 0; i < b.N; i++ {
+				total += sum(input)
+			}
+		})
+	}
+}
+
+// fnv64a returns the 64-bit FNV-1a hash of b.
+func fnv64a(b []byte) uint64 {
+	const (
+		offset64 = 14695981039346656037
+		prime64  = 1099511628211
+	)
+
+	h := uint64(offset64)
+	// Range over the byte values. A single-variable range yields the indexes
+	// instead, which would make the hash depend only on len(b).
+	for _, c := range b {
+		h ^= uint64(c)
+		h *= prime64
+	}
+	return h
+}
+
+// BenchmarkCRC32_diff compares three ways of computing a Castagnoli CRC32 over
+// 512-byte random buffers: the one-shot crc32.Checksum, a single hash that is
+// Reset between inputs, and a newly created hash per input.
+func BenchmarkCRC32_diff(b *testing.B) {
+	const (
+		numBufs = 1000
+		bufSize = 512
+	)
+
+	data := make([][]byte, 0, numBufs)
+
+	for i := 0; i < numBufs; i++ {
+		buf := make([]byte, bufSize)
+		// Without random input the comparison is meaningless, so abort on error.
+		if _, err := rand.Read(buf); err != nil {
+			b.Fatal(err)
+		}
+		data = append(data, buf)
+	}
+
+	ctab := crc32.MakeTable(crc32.Castagnoli)
+	total := uint32(0)
+
+	b.Run("direct", func(b *testing.B) {
+		b.ReportAllocs()
+
+		for i := 0; i < b.N; i++ {
+			total += crc32.Checksum(data[i%numBufs], ctab)
+		}
+	})
+	b.Run("hash-reuse", func(b *testing.B) {
+		b.ReportAllocs()
+		h := crc32.New(ctab)
+
+		for i := 0; i < b.N; i++ {
+			h.Reset()
+			h.Write(data[i%numBufs])
+			total += h.Sum32()
+		}
+	})
+	b.Run("hash-new", func(b *testing.B) {
+		b.ReportAllocs()
+
+		for i := 0; i < b.N; i++ {
+			h := crc32.New(ctab)
+			h.Write(data[i%numBufs])
+			total += h.Sum32()
+		}
+	})
+
+	// Emit the accumulated value so the checksum results are observably used.
+	fmt.Println(total)
+}
--- a/tsdb/test/labels_test.go
+++ b/tsdb/test/labels_test.go
@ -0,0 +1,216 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package test
+
+import (
+	"bytes"
+	"crypto/rand"
+	"testing"
+
+	"github.com/prometheus/tsdb/labels"
+)
+
+// BenchmarkMapClone measures copying a nine-entry map[string]string into a
+// freshly allocated map. Each iteration clones the result of the previous one.
+func BenchmarkMapClone(b *testing.B) {
+	current := map[string]string{
+		"job":        "node",
+		"instance":   "123.123.1.211:9090",
+		"path":       "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":     "GET",
+		"namespace":  "system",
+		"status":     "500",
+		"prometheus": "prometheus-core-1",
+		"datacenter": "eu-west-1",
+		"pod_name":   "abcdef-99999-defee",
+	}
+
+	for n := 0; n < b.N; n++ {
+		clone := make(map[string]string, len(current))
+		for name, value := range current {
+			clone[name] = value
+		}
+		current = clone
+	}
+}
+
+// BenchmarkLabelsClone measures copying a nine-label labels.Labels value into
+// a freshly allocated one. Each iteration clones the result of the previous
+// one.
+func BenchmarkLabelsClone(b *testing.B) {
+	current := labels.FromMap(map[string]string{
+		"job":        "node",
+		"instance":   "123.123.1.211:9090",
+		"path":       "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":     "GET",
+		"namespace":  "system",
+		"status":     "500",
+		"prometheus": "prometheus-core-1",
+		"datacenter": "eu-west-1",
+		"pod_name":   "abcdef-99999-defee",
+	})
+
+	for n := 0; n < b.N; n++ {
+		clone := make(labels.Labels, len(current))
+		copy(clone, current)
+		current = clone
+	}
+}
+
+// BenchmarkLabelMapAccess measures a map[string]string lookup, with one
+// sub-benchmark per key of the map.
+func BenchmarkLabelMapAccess(b *testing.B) {
+	lookup := map[string]string{
+		"job":        "node",
+		"instance":   "123.123.1.211:9090",
+		"path":       "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":     "GET",
+		"namespace":  "system",
+		"status":     "500",
+		"prometheus": "prometheus-core-1",
+		"datacenter": "eu-west-1",
+		"pod_name":   "abcdef-99999-defee",
+	}
+
+	// got receives every lookup result and is referenced after the loop.
+	var got string
+
+	for name := range lookup {
+		b.Run(name, func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				got = lookup[name]
+			}
+		})
+	}
+
+	_ = got
+}
+
+// BenchmarkLabelSetAccess measures labels.Labels.Get, with one sub-benchmark
+// per label name in the set.
+func BenchmarkLabelSetAccess(b *testing.B) {
+	set := labels.FromMap(map[string]string{
+		"job":        "node",
+		"instance":   "123.123.1.211:9090",
+		"path":       "/api/v1/namespaces/<namespace>/deployments/<name>",
+		"method":     "GET",
+		"namespace":  "system",
+		"status":     "500",
+		"prometheus": "prometheus-core-1",
+		"datacenter": "eu-west-1",
+		"pod_name":   "abcdef-99999-defee",
+	})
+
+	// got receives every lookup result and is referenced after the loop.
+	var got string
+
+	for _, lbl := range set {
+		b.Run(lbl.Name, func(b *testing.B) {
+			for n := 0; n < b.N; n++ {
+				got = set.Get(lbl.Name)
+			}
+		})
+	}
+
+	_ = got
+}
+
+// BenchmarkStringBytesEquals compares three equality checks on 60-byte inputs:
+// string ==, bytes.Equal, and bytes.Equal guarded by an explicit length check.
+// Each check runs against equal inputs, inputs differing in one byte at the
+// end, middle or start, and inputs of different length.
+func BenchmarkStringBytesEquals(b *testing.B) {
+	const size = 60
+
+	// randBytes returns two separate slices holding the same n random bytes.
+	randBytes := func(n int) ([]byte, []byte) {
+		buf1 := make([]byte, n)
+		if _, err := rand.Read(buf1); err != nil {
+			b.Fatal(err)
+		}
+		buf2 := make([]byte, n)
+		// copy takes (dst, src): duplicate the random data into buf2 rather than
+		// overwriting it with buf2's zeroes.
+		copy(buf2, buf1)
+
+		return buf1, buf2
+	}
+
+	// flipAt returns a generator whose second slice differs from the first in
+	// exactly the byte at index i.
+	flipAt := func(i int) func() ([]byte, []byte) {
+		return func() ([]byte, []byte) {
+			b1, b2 := randBytes(size)
+			// XOR with 0xff inverts every bit, so the byte is guaranteed to change
+			// (x ^= x would always produce 0).
+			b2[i] ^= 0xff
+			return b1, b2
+		}
+	}
+
+	cases := []struct {
+		name string
+		f    func() ([]byte, []byte)
+	}{
+		{
+			name: "equal",
+			f: func() ([]byte, []byte) {
+				return randBytes(size)
+			},
+		},
+		{
+			name: "1-flip-end",
+			f:    flipAt(size - 1),
+		},
+		{
+			name: "1-flip-middle",
+			f:    flipAt(size/2 - 1),
+		},
+		{
+			name: "1-flip-start",
+			f:    flipAt(0),
+		},
+		{
+			name: "different-length",
+			f: func() ([]byte, []byte) {
+				b1, b2 := randBytes(size)
+				return b1, b2[:size-1]
+			},
+		},
+	}
+
+	for _, c := range cases {
+		b.Run(c.name+"-strings", func(b *testing.B) {
+			ab, bb := c.f()
+			as, bs := string(ab), string(bb)
+			b.SetBytes(int64(len(as)))
+
+			var r bool
+
+			for i := 0; i < b.N; i++ {
+				r = as == bs
+			}
+			_ = r
+		})
+
+		b.Run(c.name+"-bytes", func(b *testing.B) {
+			ab, bb := c.f()
+			b.SetBytes(int64(len(ab)))
+
+			var r bool
+
+			for i := 0; i < b.N; i++ {
+				r = bytes.Equal(ab, bb)
+			}
+			_ = r
+		})
+
+		b.Run(c.name+"-bytes-length-check", func(b *testing.B) {
+			ab, bb := c.f()
+			b.SetBytes(int64(len(ab)))
+
+			var r bool
+
+			for i := 0; i < b.N; i++ {
+				// Skip the comparison entirely when the lengths already differ.
+				if len(ab) != len(bb) {
+					continue
+				}
+				r = bytes.Equal(ab, bb)
+			}
+			_ = r
+		})
+	}
+}
--- a/tsdb/testdata/20kseries.json
+++ b/tsdb/testdata/20kseries.json
--- a/tsdb/testdata/repair_index_version/01BZJ9WJQPWHGNC2W4J9TA62KC/index
+++ b/tsdb/testdata/repair_index_version/01BZJ9WJQPWHGNC2W4J9TA62KC/index
--- a/tsdb/testdata/repair_index_version/01BZJ9WJQPWHGNC2W4J9TA62KC/meta.json
+++ b/tsdb/testdata/repair_index_version/01BZJ9WJQPWHGNC2W4J9TA62KC/meta.json
@ -0,0 +1,17 @@
+{
+	"version": 2,
+	"ulid": "01BZJ9WJR6Z192734YNMD62F6M",
+	"minTime": 1511366400000,
+	"maxTime": 1511368200000,
+	"stats": {
+		"numSamples": 31897565,
+		"numSeries": 88910,
+		"numChunks": 266093
+	},
+	"compaction": {
+		"level": 1,
+		"sources": [
+			"01BZJ9WJR6Z192734YNMD62F6M"
+		]
+	}
+}
--- a/tsdb/testutil/directory.go
+++ b/tsdb/testutil/directory.go
@ -0,0 +1,182 @@
+// Copyright 2013 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"crypto/sha256"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+)
+
+const (
+	// defaultDirectory is the base directory NewTemporaryDirectory hands to
+	// ioutil.TempDir. The empty string instructs ioutil.TempDir to use the
+	// operating system's default temporary directory (which honors the TMPDIR
+	// environment variable on Unix systems).
+	defaultDirectory = ""
+
+	// NilCloser is a no-op Closer.
+	NilCloser = nilCloser(true)
+
+	// temporaryDirectoryRemoveRetries is the number of additional removal
+	// attempts temporaryDirectory.Close makes after the first one fails.
+	temporaryDirectoryRemoveRetries = 2
+)
+
+type (
+	// Closer is the interface that wraps the Close method.
+	Closer interface {
+		// Close reaps the underlying directory and its children. The directory
+		// could be deleted by its users already.
+		Close()
+	}
+
+	// nilCloser is the type behind NilCloser; its Close method does nothing.
+	nilCloser bool
+
+	// TemporaryDirectory models a closeable path for transient POSIX disk
+	// activities.
+	TemporaryDirectory interface {
+		Closer
+
+		// Path returns the underlying path for access.
+		Path() string
+	}
+
+	// temporaryDirectory is kept as a private type due to private fields and
+	// their interactions.
+	temporaryDirectory struct {
+		// path is the directory that Close removes.
+		path string
+		// tester receives a Fatal call when the removal keeps failing.
+		tester T
+	}
+
+	// callbackCloser is a Closer whose Close method invokes fn.
+	callbackCloser struct {
+		fn func()
+	}
+
+	// T implements the needed methods of testing.TB so that we do not need
+	// to actually import testing (which has the side effect of adding all
+	// the test flags, which we do not want in non-test binaries even if
+	// they make use of these utilities for some reason).
+	T interface {
+		Fatal(args ...interface{})
+		Fatalf(format string, args ...interface{})
+	}
+)
+
+// Close implements Closer and does nothing.
+func (c nilCloser) Close() {
+}
+
+// Close implements Closer by invoking the stored callback.
+func (c callbackCloser) Close() {
+	c.fn()
+}
+
+// NewCallbackCloser wraps fn in a Closer; calling Close on the result runs
+// fn.
+func NewCallbackCloser(fn func()) Closer {
+	closer := callbackCloser{fn: fn}
+	return &closer
+}
+
+// Close removes the directory and everything below it. A failed removal is
+// retried up to temporaryDirectoryRemoveRetries times; a "does not exist"
+// error counts as success. If the removal still fails, the tester's Fatal
+// method is called with the last error.
+func (t temporaryDirectory) Close() {
+	err := os.RemoveAll(t.path)
+	for left := temporaryDirectoryRemoveRetries; err != nil && left > 0; {
+		if os.IsNotExist(err) {
+			// Somebody else already removed it; nothing left to do.
+			err = nil
+			break
+		}
+		left--
+		err = os.RemoveAll(t.path)
+	}
+	if err != nil {
+		t.tester.Fatal(err)
+	}
+}
+
+// Path returns the path of the temporary directory.
+func (t temporaryDirectory) Path() string {
+	return t.path
+}
+
+// NewTemporaryDirectory creates a fresh temporary directory whose name starts
+// with name and returns a handle that removes it on Close. A creation failure
+// is reported through t.Fatal.
+func NewTemporaryDirectory(name string, t T) (handler TemporaryDirectory) {
+	dir, err := ioutil.TempDir(defaultDirectory, name)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	return temporaryDirectory{path: dir, tester: t}
+}
+
+// DirSize walks path and returns the summed size in bytes of every
+// non-directory entry found. Any error met during the walk fails the test.
+func DirSize(t *testing.T, path string) int64 {
+	var total int64
+	walkErr := filepath.Walk(path, func(_ string, fi os.FileInfo, err error) error {
+		Ok(t, err)
+		if fi.IsDir() {
+			return nil
+		}
+		total += fi.Size()
+		return nil
+	})
+	Ok(t, walkErr)
+	return total
+}
+
+// DirHash returns a SHA-256 hash over all files below path. For every
+// non-directory entry the hash covers, in this order: the file content, its
+// size in decimal, its base name and its gob-encoded modification time. Any
+// error met along the way fails the test.
+func DirHash(t *testing.T, path string) []byte {
+	hash := sha256.New()
+	err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
+		Ok(t, err)
+
+		if info.IsDir() {
+			return nil
+		}
+		f, err := os.Open(path)
+		Ok(t, err)
+		// Runs when this callback returns, i.e. once per visited file.
+		defer f.Close()
+
+		_, err = io.Copy(hash, f)
+		Ok(t, err)
+
+		// FormatInt keeps the full int64; converting to int first would
+		// truncate the size on 32-bit platforms.
+		_, err = io.WriteString(hash, strconv.FormatInt(info.Size(), 10))
+		Ok(t, err)
+
+		_, err = io.WriteString(hash, info.Name())
+		Ok(t, err)
+
+		modTime, err := info.ModTime().GobEncode()
+		Ok(t, err)
+
+		_, err = io.WriteString(hash, string(modTime))
+		Ok(t, err)
+		return nil
+	})
+	Ok(t, err)
+
+	return hash.Sum(nil)
+}
--- a/tsdb/testutil/logging.go
+++ b/tsdb/testutil/logging.go
@ -0,0 +1,35 @@
+// Copyright 2019 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"testing"
+
+	"github.com/go-kit/kit/log"
+)
+
+// logger adapts a *testing.T to the go-kit log.Logger interface.
+type logger struct {
+	t *testing.T
+}
+
+// NewLogger returns a gokit compatible Logger that forwards everything it is
+// given to t.Log.
+func NewLogger(t *testing.T) log.Logger {
+	var l logger
+	l.t = t
+	return l
+}
+
+// Log implements log.Logger by passing keyvals to the wrapped test's Log
+// method. It always returns nil.
+func (l logger) Log(keyvals ...interface{}) error {
+	l.t.Log(keyvals...)
+	return nil
+}
--- a/tsdb/testutil/testutil.go
+++ b/tsdb/testutil/testutil.go
@ -0,0 +1,87 @@
+// The MIT License (MIT)
+
+// Copyright (c) 2014 Ben Johnson
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+package testutil
+
+import (
+	"fmt"
+	"path/filepath"
+	"reflect"
+	"runtime"
+	"testing"
+)
+
+// Assert does nothing when condition holds. Otherwise it prints msg, formatted
+// with v and prefixed by the caller's file and line, and fails the test
+// immediately.
+func Assert(tb testing.TB, condition bool, msg string, v ...interface{}) {
+	if condition {
+		return
+	}
+
+	_, file, line, _ := runtime.Caller(1)
+	args := append([]interface{}{filepath.Base(file), line}, v...)
+	fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", args...)
+	tb.FailNow()
+}
+
+// Ok does nothing when err is nil. Otherwise it prints the error together with
+// the caller's file and line, and fails the test immediately.
+func Ok(tb testing.TB, err error) {
+	if err == nil {
+		return
+	}
+
+	_, file, line, _ := runtime.Caller(1)
+	base := filepath.Base(file)
+	fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", base, line, err.Error())
+	tb.FailNow()
+}
+
+// NotOk does nothing when err is non-nil. Otherwise it prints a message with
+// the caller's file and line, and fails the test immediately.
+func NotOk(tb testing.TB, err error) {
+	if err != nil {
+		return
+	}
+
+	_, file, line, _ := runtime.Caller(1)
+	base := filepath.Base(file)
+	fmt.Printf("\033[31m%s:%d: expected error, got nothing \033[39m\n\n", base, line)
+	tb.FailNow()
+}
+
+// Equals does nothing when exp and act are deeply equal. Otherwise it prints
+// both values, the optional message built from msgAndArgs and the caller's
+// file and line, and fails the test immediately.
+func Equals(tb testing.TB, exp, act interface{}, msgAndArgs ...interface{}) {
+	if reflect.DeepEqual(exp, act) {
+		return
+	}
+
+	_, file, line, _ := runtime.Caller(1)
+	base := filepath.Base(file)
+	note := formatMessage(msgAndArgs)
+	fmt.Printf("\033[31m%s:%d:%s\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", base, line, note, exp, act)
+	tb.FailNow()
+}
+
+// NotEquals does nothing when exp and act differ. When they are deeply equal
+// it prints both values with the caller's file and line, and fails the test
+// immediately.
+func NotEquals(tb testing.TB, exp, act interface{}) {
+	if !reflect.DeepEqual(exp, act) {
+		return
+	}
+
+	_, file, line, _ := runtime.Caller(1)
+	base := filepath.Base(file)
+	fmt.Printf("\033[31m%s:%d: Expected different exp and got\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", base, line, exp, act)
+	tb.FailNow()
+}
+
+// formatMessage builds the optional message part of a failure report. When the
+// first element of msgAndArgs is a string, it is used as a format string for
+// the remaining elements and the result is prefixed with "\n\nmsg: ". In every
+// other case the empty string is returned.
+func formatMessage(msgAndArgs []interface{}) string {
+	if len(msgAndArgs) > 0 {
+		if format, isString := msgAndArgs[0].(string); isString {
+			return fmt.Sprintf("\n\nmsg: "+format, msgAndArgs[1:]...)
+		}
+	}
+	return ""
+}
--- a/tsdb/tombstones.go
+++ b/tsdb/tombstones.go
@ -0,0 +1,304 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
+	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/encoding"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/fileutil"
+)
+
+// tombstoneFilename is the name of the tombstones file inside a directory.
+const tombstoneFilename = "tombstones"
+
+const (
+	// MagicTombstone is 4 bytes at the head of a tombstone file.
+	MagicTombstone = 0x0130BA30
+
+	// tombstoneFormatV1 is the format byte stored right after the magic number;
+	// readTombstones rejects any other value.
+	tombstoneFormatV1 = 1
+)
+
+// TombstoneReader gives access to tombstone intervals by series reference.
+type TombstoneReader interface {
+	// Get returns deletion intervals for the series with the given reference.
+	Get(ref uint64) (Intervals, error)
+
+	// Iter calls the given function for each series reference together with
+	// all of its intervals. Iteration stops at the first error the function
+	// returns, and that error is returned.
+	Iter(func(uint64, Intervals) error) error
+
+	// Total returns the total count of tombstones.
+	Total() uint64
+
+	// Close closes any underlying resources.
+	Close() error
+}
+
+// writeTombstoneFile persists every interval exposed by tr to the tombstones
+// file in dir and returns the number of bytes written. On error the returned
+// size is 0.
+//
+// File layout: the 4-byte big-endian MagicTombstone, the 1-byte format
+// version, one (uvarint ref, varint mint, varint maxt) record per interval,
+// and finally the checksum of the record section. Everything is written to a
+// ".tmp" sibling first, which is synced, closed and then moved into place with
+// fileutil.Replace. Failures while cleaning up are only logged to logger.
+func writeTombstoneFile(logger log.Logger, dir string, tr TombstoneReader) (int64, error) {
+	path := filepath.Join(dir, tombstoneFilename)
+	tmp := path + ".tmp"
+	hash := newCRC32()
+	var size int
+
+	f, err := os.Create(tmp)
+	if err != nil {
+		return 0, err
+	}
+	defer func() {
+		// f is reset to nil as soon as it has been closed explicitly, so the
+		// file is closed here only on the early error returns.
+		if f != nil {
+			if err := f.Close(); err != nil {
+				level.Error(logger).Log("msg", "close tmp file", "err", err.Error())
+			}
+		}
+		// After a successful Replace the tmp file no longer exists and this is
+		// a no-op.
+		if err := os.RemoveAll(tmp); err != nil {
+			level.Error(logger).Log("msg", "remove tmp file", "err", err.Error())
+		}
+	}()
+
+	buf := encoding.Encbuf{B: make([]byte, 3*binary.MaxVarintLen64)}
+	buf.Reset()
+	// Write the meta.
+	buf.PutBE32(MagicTombstone)
+	buf.PutByte(tombstoneFormatV1)
+	n, err := f.Write(buf.Get())
+	if err != nil {
+		return 0, err
+	}
+	size += n
+
+	// The header above goes to the file only; the records below are also fed
+	// into the checksum.
+	mw := io.MultiWriter(f, hash)
+
+	if err := tr.Iter(func(ref uint64, ivs Intervals) error {
+		for _, iv := range ivs {
+			buf.Reset()
+
+			buf.PutUvarint64(ref)
+			buf.PutVarint64(iv.Mint)
+			buf.PutVarint64(iv.Maxt)
+
+			n, err = mw.Write(buf.Get())
+			if err != nil {
+				return err
+			}
+			size += n
+		}
+		return nil
+	}); err != nil {
+		return 0, fmt.Errorf("error writing tombstones: %v", err)
+	}
+
+	n, err = f.Write(hash.Sum(nil))
+	if err != nil {
+		return 0, err
+	}
+	size += n
+
+	var merr tsdb_errors.MultiError
+	if merr.Add(f.Sync()); merr.Err() != nil {
+		merr.Add(f.Close())
+		// Already closed; keep the deferred cleanup from closing it again and
+		// logging a spurious error.
+		f = nil
+		return 0, merr.Err()
+	}
+
+	err = f.Close()
+	// Whether or not Close succeeded, it must not be attempted a second time.
+	f = nil
+	if err != nil {
+		return 0, err
+	}
+
+	if err := fileutil.Replace(tmp, path); err != nil {
+		return 0, err
+	}
+	return int64(size), nil
+}
+
+// Stone holds the information on the posting and time-range
+// that is deleted.
+type Stone struct {
+	// ref is the series reference the intervals belong to.
+	ref uint64
+	// intervals are the deleted time ranges of that series.
+	intervals Intervals
+}
+
+// readTombstones loads the tombstones file found in dir. It returns the
+// decoded tombstones together with the size of the file in bytes. A missing
+// file is not an error: an empty reader and a size of 0 are returned.
+//
+// The file must start with MagicTombstone and the tombstoneFormatV1 byte and
+// end with a 4-byte big-endian checksum of everything in between.
+func readTombstones(dir string) (TombstoneReader, int64, error) {
+	const (
+		headerLen   = 5 // 4-byte magic number + 1-byte format version.
+		checksumLen = 4
+	)
+
+	b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename))
+	if os.IsNotExist(err) {
+		return newMemTombstones(), 0, nil
+	}
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// The smallest valid file is a header directly followed by the checksum.
+	// Checking for the header alone would let truncated files through to the
+	// decoder, which then reports a misleading magic or format error.
+	if len(b) < headerLen+checksumLen {
+		return nil, 0, errors.Wrap(encoding.ErrInvalidSize, "tombstones header")
+	}
+
+	d := &encoding.Decbuf{B: b[:len(b)-checksumLen]}
+	if mg := d.Be32(); mg != MagicTombstone {
+		return nil, 0, fmt.Errorf("invalid magic number %x", mg)
+	}
+	if flag := d.Byte(); flag != tombstoneFormatV1 {
+		return nil, 0, fmt.Errorf("invalid tombstone format %x", flag)
+	}
+
+	if d.Err() != nil {
+		return nil, 0, d.Err()
+	}
+
+	// Verify checksum. At this point d holds only the records that follow the
+	// header.
+	hash := newCRC32()
+	if _, err := hash.Write(d.Get()); err != nil {
+		return nil, 0, errors.Wrap(err, "write to hash")
+	}
+	if binary.BigEndian.Uint32(b[len(b)-checksumLen:]) != hash.Sum32() {
+		return nil, 0, errors.New("checksum did not match")
+	}
+
+	stonesMap := newMemTombstones()
+
+	for d.Len() > 0 {
+		k := d.Uvarint64()
+		mint := d.Varint64()
+		maxt := d.Varint64()
+		if d.Err() != nil {
+			return nil, 0, d.Err()
+		}
+
+		stonesMap.addInterval(k, Interval{mint, maxt})
+	}
+
+	return stonesMap, int64(len(b)), nil
+}
+
+// memTombstones is an in-memory TombstoneReader. It keeps the intervals of
+// every series reference in a map that is guarded by a read/write mutex.
+type memTombstones struct {
+	intvlGroups map[uint64]Intervals
+	mtx         sync.RWMutex
+}
+
+// newMemTombstones returns an empty in-memory TombstoneReader to which
+// intervals can be added.
+func newMemTombstones() *memTombstones {
+	t := new(memTombstones)
+	t.intvlGroups = make(map[uint64]Intervals)
+	return t
+}
+
+// Get returns the intervals stored for ref, or nil when there are none. The
+// error is always nil.
+func (t *memTombstones) Get(ref uint64) (Intervals, error) {
+	t.mtx.RLock()
+	ivs := t.intvlGroups[ref]
+	t.mtx.RUnlock()
+
+	return ivs, nil
+}
+
+// Iter calls f once per stored series reference with all of its intervals and
+// stops at the first error f returns. The read lock is held for the whole
+// iteration, including while f runs.
+func (t *memTombstones) Iter(f func(uint64, Intervals) error) error {
+	t.mtx.RLock()
+	defer t.mtx.RUnlock()
+
+	for ref := range t.intvlGroups {
+		err := f(ref, t.intvlGroups[ref])
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Total returns the number of intervals summed over all series references.
+func (t *memTombstones) Total() uint64 {
+	t.mtx.RLock()
+	defer t.mtx.RUnlock()
+
+	var count uint64
+	for ref := range t.intvlGroups {
+		count += uint64(len(t.intvlGroups[ref]))
+	}
+	return count
+}
+
+// addInterval merges the given intervals, one after the other, into the
+// intervals already stored for ref.
+func (t *memTombstones) addInterval(ref uint64, itvs ...Interval) {
+	t.mtx.Lock()
+	defer t.mtx.Unlock()
+
+	for i := range itvs {
+		t.intvlGroups[ref] = t.intvlGroups[ref].add(itvs[i])
+	}
+}
+
+// Close implements TombstoneReader. There is nothing to release, so it always
+// returns nil.
+func (*memTombstones) Close() error {
+	return nil
+}
+
+// Interval represents a single time-interval. Both bounds are inclusive.
+type Interval struct {
+	Mint, Maxt int64
+}
+
+// inBounds reports whether t lies within the closed interval [Mint, Maxt].
+func (tr Interval) inBounds(t int64) bool {
+	return t >= tr.Mint && t <= tr.Maxt
+}
+
+// isSubrange reports whether tr is fully contained in one single interval of
+// dranges.
+func (tr Interval) isSubrange(dranges Intervals) bool {
+	for _, r := range dranges {
+		if r.inBounds(tr.Mint) && r.inBounds(tr.Maxt) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Intervals represents a set of increasing and non-overlapping time-intervals.
+type Intervals []Interval
+
+// add merges the new time-range into the existing ones and returns the result.
+// The existing ones must be sorted and non-overlapping; n is expected to
+// satisfy Mint <= Maxt.
+//
+// Every existing interval that overlaps n or touches it directly (for example
+// [1,4] and [5,6]) is folded together with n into one interval. If no such
+// interval exists, n is inserted at its sorted position. The receiver's
+// backing array may be modified, so callers must use the returned value only.
+func (itvs Intervals) add(n Interval) Intervals {
+	// first is the index of the first interval that overlaps or touches n.
+	// Everything before it ends at least two steps before n starts. The +1
+	// cannot overflow because it is only evaluated when Maxt < n.Mint.
+	first := 0
+	for first < len(itvs) && itvs[first].Maxt < n.Mint && itvs[first].Maxt+1 != n.Mint {
+		first++
+	}
+
+	// end is one past the last interval that overlaps or touches n. The -1
+	// cannot overflow because it is only evaluated when Mint > n.Maxt.
+	end := first
+	for end < len(itvs) && (itvs[end].Mint <= n.Maxt || itvs[end].Mint-1 == n.Maxt) {
+		end++
+	}
+
+	if first == end {
+		// Nothing to merge with: plain insertion at position first.
+		if first == len(itvs) {
+			return append(itvs, n)
+		}
+		out := make(Intervals, 0, len(itvs)+1)
+		out = append(out, itvs[:first]...)
+		out = append(out, n)
+		return append(out, itvs[first:]...)
+	}
+
+	// Fold n and itvs[first:end] into itvs[first], then drop the rest of the
+	// merged run.
+	if n.Mint < itvs[first].Mint {
+		itvs[first].Mint = n.Mint
+	}
+	maxt := itvs[end-1].Maxt
+	if n.Maxt > maxt {
+		maxt = n.Maxt
+	}
+	itvs[first].Maxt = maxt
+
+	return append(itvs[:first+1], itvs[end:]...)
+}
--- a/tsdb/tombstones_test.go
+++ b/tsdb/tombstones_test.go
@ -0,0 +1,150 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/go-kit/kit/log"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestWriteAndReadbackTombStones writes randomly generated tombstones to a
+// temporary directory and checks that reading the file back yields an equal
+// reader.
+func TestWriteAndReadbackTombStones(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "test")
+	// Stop right here if there is no directory to work in.
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	ref := uint64(0)
+
+	stones := newMemTombstones()
+	// Generate the tombstones.
+	for i := 0; i < 100; i++ {
+		ref += uint64(rand.Int31n(10)) + 1
+		numRanges := rand.Intn(5) + 1
+		dranges := make(Intervals, 0, numRanges)
+		mint := rand.Int63n(time.Now().UnixNano())
+		for j := 0; j < numRanges; j++ {
+			dranges = dranges.add(Interval{mint, mint + rand.Int63n(1000)})
+			mint += rand.Int63n(1000) + 1
+		}
+		stones.addInterval(ref, dranges...)
+	}
+
+	_, err = writeTombstoneFile(log.NewNopLogger(), tmpdir, stones)
+	testutil.Ok(t, err)
+
+	restr, _, err := readTombstones(tmpdir)
+	testutil.Ok(t, err)
+
+	// Compare the two readers.
+	testutil.Equals(t, stones, restr)
+}
+
+// TestAddingNewIntervals checks Intervals.add against a table of existing
+// intervals, the interval to add and the expected outcome.
+func TestAddingNewIntervals(t *testing.T) {
+	type testCase struct {
+		existing Intervals
+		added    Interval
+		want     Intervals
+	}
+
+	for _, tc := range []testCase{
+		{
+			added: Interval{1, 2},
+			want:  Intervals{{1, 2}},
+		},
+		{
+			existing: Intervals{{1, 2}},
+			added:    Interval{1, 2},
+			want:     Intervals{{1, 2}},
+		},
+		{
+			existing: Intervals{{1, 4}, {6, 6}},
+			added:    Interval{5, 6},
+			want:     Intervals{{1, 6}},
+		},
+		{
+			existing: Intervals{{1, 10}, {12, 20}, {25, 30}},
+			added:    Interval{21, 23},
+			want:     Intervals{{1, 10}, {12, 23}, {25, 30}},
+		},
+		{
+			existing: Intervals{{1, 2}, {3, 5}, {7, 7}},
+			added:    Interval{6, 7},
+			want:     Intervals{{1, 2}, {3, 7}},
+		},
+		{
+			existing: Intervals{{1, 10}, {12, 20}, {25, 30}},
+			added:    Interval{21, 25},
+			want:     Intervals{{1, 10}, {12, 30}},
+		},
+		{
+			existing: Intervals{{1, 10}, {12, 20}, {25, 30}},
+			added:    Interval{18, 23},
+			want:     Intervals{{1, 10}, {12, 23}, {25, 30}},
+		},
+		{
+			existing: Intervals{{1, 10}, {12, 20}, {25, 30}},
+			added:    Interval{9, 23},
+			want:     Intervals{{1, 23}, {25, 30}},
+		},
+		{
+			existing: Intervals{{1, 10}, {12, 20}, {25, 30}},
+			added:    Interval{9, 230},
+			want:     Intervals{{1, 230}},
+		},
+		{
+			existing: Intervals{{5, 10}, {12, 20}, {25, 30}},
+			added:    Interval{1, 4},
+			want:     Intervals{{1, 10}, {12, 20}, {25, 30}},
+		},
+		{
+			existing: Intervals{{5, 10}, {12, 20}, {25, 30}},
+			added:    Interval{11, 14},
+			want:     Intervals{{5, 20}, {25, 30}},
+		},
+	} {
+		testutil.Equals(t, tc.want, tc.existing.add(tc.added))
+	}
+}
+
+// TestMemTombstonesConcurrency makes sure memTombstones is safe to access from
+// different goroutines: one goroutine adds intervals while another one reads
+// them.
+func TestMemTombstonesConcurrency(t *testing.T) {
+	tomb := newMemTombstones()
+	totalRuns := 100
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	go func() {
+		defer wg.Done()
+		for x := 0; x < totalRuns; x++ {
+			tomb.addInterval(uint64(x), Interval{int64(x), int64(x)})
+		}
+	}()
+	go func() {
+		defer wg.Done()
+		for x := 0; x < totalRuns; x++ {
+			// Report with t.Errorf rather than testutil.Ok: the latter ends in
+			// FailNow, which must only be called from the goroutine running the
+			// test function.
+			if _, err := tomb.Get(uint64(x)); err != nil {
+				t.Errorf("unexpected error: %s", err)
+				return
+			}
+		}
+	}()
+	wg.Wait()
+}
--- a/tsdb/tsdbutil/buffer.go
+++ b/tsdb/tsdbutil/buffer.go
@ -0,0 +1,236 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdbutil
+
+import (
+	"math"
+)
+
+// SeriesIterator iterates over the data of a time series.
+type SeriesIterator interface {
+	// Seek advances the iterator forward to the given timestamp.
+	// If there's no value exactly at t, it advances to the first value
+	// after t. BufferedSeriesIterator.Seek gives up immediately when this
+	// reports false.
+	Seek(t int64) bool
+	// At returns the current timestamp/value pair.
+	At() (t int64, v float64)
+	// Next advances the iterator by one and reports whether that succeeded.
+	Next() bool
+	// Err returns the current error.
+	Err() error
+}
+
+// BufferedSeriesIterator wraps an iterator with a look-back buffer.
+// Values are created with NewBuffer.
+type BufferedSeriesIterator struct {
+	// it is the wrapped iterator; buf receives each element that Next moves
+	// past.
+	it  SeriesIterator
+	buf *sampleRing
+
+	// lastTime is the timestamp of the element the iterator was last
+	// positioned on by Seek or Next. NewBuffer starts it at math.MinInt64.
+	lastTime int64
+}
+
+// NewBuffer wraps it in a BufferedSeriesIterator whose look-back buffer keeps
+// the values lying within delta before the current element.
+func NewBuffer(it SeriesIterator, delta int64) *BufferedSeriesIterator {
+	b := new(BufferedSeriesIterator)
+	b.it = it
+	// The ring starts with room for 16 samples and grows when it fills up.
+	b.buf = newSampleRing(delta, 16)
+	// Nothing has been visited yet, so every timestamp lies ahead.
+	b.lastTime = math.MinInt64
+	return b
+}
+
+// PeekBack returns the previous element of the iterator, i.e. the sample most
+// recently added to the look-back buffer. If there is none buffered,
+// ok is false.
+func (b *BufferedSeriesIterator) PeekBack() (t int64, v float64, ok bool) {
+	return b.buf.last()
+}
+
+// Buffer returns an iterator over the buffered data.
+// The iterator reads the ring buffer directly rather than a copy, and its
+// Seek method always reports false.
+func (b *BufferedSeriesIterator) Buffer() SeriesIterator {
+	return b.buf.iterator()
+}
+
+// Seek advances the iterator to the element at time t or greater and reports
+// whether such an element exists.
+func (b *BufferedSeriesIterator) Seek(t int64) bool {
+	// Jump the wrapped iterator only when the start of the look-back window
+	// lies ahead of the current position. Otherwise the jump would go
+	// backwards, so the buffer is kept and filled by stepping forward below.
+	if windowStart := t - b.buf.delta; windowStart > b.lastTime {
+		b.buf.reset()
+
+		if !b.it.Seek(windowStart) {
+			return false
+		}
+		b.lastTime, _ = b.At()
+	}
+
+	// Step forward, buffering on the way, until t is reached or the
+	// iterator is exhausted.
+	for b.lastTime < t {
+		if !b.Next() {
+			return false
+		}
+	}
+	return true
+}
+
+// Next advances the iterator to the next element and reports whether there
+// was one.
+func (b *BufferedSeriesIterator) Next() bool {
+	// The element being left behind goes into the look-back buffer first.
+	b.buf.add(b.it.At())
+
+	if !b.it.Next() {
+		return false
+	}
+	b.lastTime, _ = b.At()
+	return true
+}
+
+// At returns the current element of the iterator, as reported by the wrapped
+// iterator.
+func (b *BufferedSeriesIterator) At() (int64, float64) {
+	return b.it.At()
+}
+
+// Err returns the last encountered error, as reported by the wrapped
+// iterator.
+func (b *BufferedSeriesIterator) Err() error {
+	return b.it.Err()
+}
+
+// sample is a single timestamp/value pair.
+type sample struct {
+	t int64
+	v float64
+}
+
+// T returns the timestamp of the sample.
+func (s sample) T() int64 {
+	return s.t
+}
+
+// V returns the value of the sample.
+func (s sample) V() float64 {
+	return s.v
+}
+
+// sampleRing is a ring buffer of samples that grows when full and, on every
+// add, drops the samples that are more than delta older than the new one.
+type sampleRing struct {
+	// delta is the width of the time window that is retained.
+	delta int64
+
+	buf []sample // lookback buffer
+	i   int      // position of most recent element in ring buffer
+	f   int      // position of first element in ring buffer
+	l   int      // number of elements in buffer
+}
+
+// newSampleRing returns an empty ring with initial room for sz samples that
+// retains a time window of width delta.
+func newSampleRing(delta int64, sz int) *sampleRing {
+	ring := &sampleRing{
+		delta: delta,
+		buf:   make([]sample, sz),
+	}
+	ring.reset()
+	return ring
+}
+
+// reset empties the ring. The backing slice is kept; only the bookkeeping
+// indices return to their initial values.
+func (r *sampleRing) reset() {
+	r.l, r.i, r.f = 0, -1, 0
+}
+
+// iterator returns a SeriesIterator over the samples held by the ring. It
+// starts before the first sample, so Next must be called before At.
+func (r *sampleRing) iterator() SeriesIterator {
+	return &sampleRingIterator{r: r, i: -1}
+}
+
+// sampleRingIterator walks the samples of a sampleRing from the first to the
+// most recent one. It is the SeriesIterator returned by sampleRing.iterator.
+type sampleRingIterator struct {
+	r *sampleRing
+	// i is the offset from the first element of the ring.
+	i int
+}
+
+// Next moves to the following sample and reports whether one exists.
+func (it *sampleRingIterator) Next() bool {
+	it.i++
+	return it.i < it.r.l
+}
+
+// Seek is not supported by this iterator and always returns false.
+func (it *sampleRingIterator) Seek(int64) bool {
+	return false
+}
+
+// Err always returns nil.
+func (it *sampleRingIterator) Err() error {
+	return nil
+}
+
+// At returns the sample at the current offset.
+func (it *sampleRingIterator) At() (int64, float64) {
+	return it.r.at(it.i)
+}
+
+// at returns the sample that is i positions after the first element of the
+// ring, wrapping around the end of the backing slice.
+func (r *sampleRing) at(i int) (int64, float64) {
+	s := r.buf[(r.f+i)%len(r.buf)]
+	return s.t, s.v
+}
+
+// add adds a sample to the ring buffer and frees all samples that fall
+// out of the delta range, i.e. whose timestamp is below t-delta.
+// The backing slice is doubled when it is full.
+func (r *sampleRing) add(t int64, v float64) {
+	l := len(r.buf)
+	// Grow the ring buffer if it fits no more elements.
+	if l == r.l {
+		newLen := 2 * l
+		if newLen == 0 {
+			// A ring created with no capacity still has to fit one sample.
+			newLen = 1
+		}
+		buf := make([]sample, newLen)
+		// Move the older part (from f to the end) to the tail of the new
+		// slice and keep the newer part at the front, so that the slot
+		// right after the newest element is free.
+		copy(buf[l+r.f:], r.buf[r.f:])
+		copy(buf, r.buf[:r.f])
+
+		r.buf = buf
+		r.i = r.f
+		r.f += l
+		// All wrap-around arithmetic below must use the grown length;
+		// otherwise f wraps too early and live samples get dropped.
+		l = newLen
+	} else {
+		r.i++
+		if r.i >= l {
+			r.i -= l
+		}
+	}
+
+	r.buf[r.i] = sample{t: t, v: v}
+	r.l++
+
+	// Free head of the buffer of samples that just fell out of the range.
+	for r.buf[r.f].t < t-r.delta {
+		r.f++
+		if r.f >= l {
+			r.f -= l
+		}
+		r.l--
+	}
+}
+
+// last returns the most recent element added to the ring. The boolean result
+// is false, and the other results zero, when the ring is empty.
+func (r *sampleRing) last() (int64, float64, bool) {
+	if r.l > 0 {
+		newest := r.buf[r.i]
+		return newest.t, newest.v, true
+	}
+	return 0, 0, false
+}
+
+// samples returns a copy of the buffered samples, ordered from the first to
+// the most recent element.
+func (r *sampleRing) samples() []sample {
+	out := make([]sample, r.l)
+
+	// The content is stored in up to two pieces: one from f towards the end
+	// of the backing slice and, if that overshoots, a wrapped piece at the
+	// front.
+	end, wrapped := r.f+r.l, 0
+	if size := len(r.buf); end > size {
+		wrapped = end - size
+		end = size
+	}
+
+	copied := copy(out, r.buf[r.f:end])
+	copy(out[copied:], r.buf[:wrapped])
+
+	return out
+}
--- a/tsdb/tsdbutil/buffer_test.go
+++ b/tsdb/tsdbutil/buffer_test.go
@ -0,0 +1,173 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdbutil
+
+import (
+	"math/rand"
+	"sort"
+	"testing"
+
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestSampleRing adds increasing timestamps to rings of various deltas and
+// initial sizes and checks after every add that exactly the earlier samples
+// within delta of the newest one are still buffered.
+func TestSampleRing(t *testing.T) {
+	cases := []struct {
+		input []int64
+		delta int64
+		size  int
+	}{
+		{
+			input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			delta: 2,
+			size:  1,
+		},
+		{
+			input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			delta: 2,
+			size:  2,
+		},
+		{
+			input: []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			delta: 7,
+			size:  3,
+		},
+		{
+			input: []int64{1, 2, 3, 4, 5, 16, 17, 18, 19, 20},
+			delta: 7,
+			size:  1,
+		},
+	}
+	for _, c := range cases {
+		r := newSampleRing(c.delta, c.size)
+
+		// Pair every timestamp with a random value.
+		input := []sample{}
+		for _, ts := range c.input {
+			input = append(input, sample{t: ts, v: float64(rand.Intn(100))})
+		}
+
+		for i, s := range input {
+			r.add(s.t, s.v)
+			buffered := r.samples()
+
+			inBuffer := func(want sample) bool {
+				for _, bs := range buffered {
+					if bs.t == want.t && bs.v == want.v {
+						return true
+					}
+				}
+				return false
+			}
+
+			for _, sold := range input[:i] {
+				found := inBuffer(sold)
+				withinDelta := sold.t >= s.t-c.delta
+
+				switch {
+				case withinDelta && !found:
+					t.Fatalf("%d: expected sample %d to be in buffer but was not; buffer %v", i, sold.t, buffered)
+				case !withinDelta && found:
+					t.Fatalf("%d: unexpected sample %d in buffer; buffer %v", i, sold.t, buffered)
+				}
+			}
+		}
+	}
+}
+
+// TestBufferedSeriesIterator drives a BufferedSeriesIterator with a delta of 2
+// over a fixed list of samples and checks, step by step, both the current
+// element and the content of the look-back buffer.
+func TestBufferedSeriesIterator(t *testing.T) {
+	var it *BufferedSeriesIterator
+
+	// bufferEq asserts that the look-back buffer holds exactly exp.
+	bufferEq := func(exp []sample) {
+		var b []sample
+		bit := it.Buffer()
+		for bit.Next() {
+			t, v := bit.At()
+			b = append(b, sample{t: t, v: v})
+		}
+		testutil.Equals(t, exp, b)
+	}
+	// sampleEq asserts that the iterator is positioned on (ets, ev).
+	sampleEq := func(ets int64, ev float64) {
+		ts, v := it.At()
+		testutil.Equals(t, ets, ts)
+		testutil.Equals(t, ev, v)
+	}
+
+	it = NewBuffer(newListSeriesIterator([]sample{
+		{t: 1, v: 2},
+		{t: 2, v: 3},
+		{t: 3, v: 4},
+		{t: 4, v: 5},
+		{t: 5, v: 6},
+		{t: 99, v: 8},
+		{t: 100, v: 9},
+		{t: 101, v: 10},
+	}), 2)
+
+	// Seeking before the first sample lands on it with an empty buffer.
+	testutil.Assert(t, it.Seek(-123) == true, "seek failed")
+	sampleEq(1, 2)
+	bufferEq(nil)
+
+	testutil.Assert(t, it.Next() == true, "next failed")
+	sampleEq(2, 3)
+	bufferEq([]sample{{t: 1, v: 2}})
+
+	// The sample at t=1 is more than delta behind t=4 and gets dropped.
+	testutil.Assert(t, it.Next() == true, "next failed")
+	testutil.Assert(t, it.Next() == true, "next failed")
+	testutil.Assert(t, it.Next() == true, "next failed")
+	sampleEq(5, 6)
+	bufferEq([]sample{{t: 2, v: 3}, {t: 3, v: 4}, {t: 4, v: 5}})
+
+	// Seeking to the current position changes nothing.
+	testutil.Assert(t, it.Seek(5) == true, "seek failed")
+	sampleEq(5, 6)
+	bufferEq([]sample{{t: 2, v: 3}, {t: 3, v: 4}, {t: 4, v: 5}})
+
+	// A seek far ahead resets the buffer and refills it with the samples
+	// within delta before the target.
+	testutil.Assert(t, it.Seek(101) == true, "seek failed")
+	sampleEq(101, 10)
+	bufferEq([]sample{{t: 99, v: 8}, {t: 100, v: 9}})
+
+	testutil.Assert(t, it.Next() == false, "next succeeded unexpectedly")
+}
+
+// listSeriesIterator is a SeriesIterator over an in-memory list of samples
+// that is expected to be sorted by timestamp.
+type listSeriesIterator struct {
+	list []sample
+	// idx is the current position in list; -1 means "before the first
+	// sample".
+	idx int
+}
+
+// newListSeriesIterator returns an iterator positioned before the first
+// sample of list.
+func newListSeriesIterator(list []sample) *listSeriesIterator {
+	return &listSeriesIterator{list: list, idx: -1}
+}
+
+// At returns the sample at the current position. It must only be called
+// while the iterator is positioned on a sample.
+func (it *listSeriesIterator) At() (int64, float64) {
+	s := it.list[it.idx]
+	return s.t, s.v
+}
+
+// Next moves to the following sample and reports whether one exists.
+func (it *listSeriesIterator) Next() bool {
+	it.idx++
+	return it.idx < len(it.list)
+}
+
+// Seek moves forward to the first sample at or after t, never going back
+// before the current position, and reports whether such a sample exists.
+func (it *listSeriesIterator) Seek(t int64) bool {
+	if it.idx == -1 {
+		it.idx = 0
+	}
+	// Do binary search between current position and end. sort.Search
+	// returns an offset relative to the current position, so it has to be
+	// added to idx rather than replace it.
+	it.idx += sort.Search(len(it.list)-it.idx, func(i int) bool {
+		s := it.list[i+it.idx]
+		return s.t >= t
+	})
+
+	return it.idx < len(it.list)
+}
+
+// Err always returns nil.
+func (it *listSeriesIterator) Err() error {
+	return nil
+}
--- a/tsdb/tsdbutil/chunks.go
+++ b/tsdb/tsdbutil/chunks.go
@ -0,0 +1,53 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdbutil
+
+import (
+	"github.com/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/tsdb/chunks"
+)
+
+// Sample is a single timestamp/value pair, as consumed by ChunkFromSamples.
+type Sample interface {
+	// T returns the timestamp of the sample.
+	T() int64
+	// V returns the value of the sample.
+	V() float64
+}
+
+// ChunkFromSamples appends the given samples, in order, to a new XOR chunk and
+// returns it as a chunks.Meta. MinTime and MaxTime are the timestamps of the
+// first and the last sample, or both 0 when s is empty.
+func ChunkFromSamples(s []Sample) chunks.Meta {
+	var meta chunks.Meta
+	if n := len(s); n > 0 {
+		meta.MinTime = s[0].T()
+		meta.MaxTime = s[n-1].T()
+	}
+
+	chk := chunkenc.NewXORChunk()
+	// The error from Appender is discarded here.
+	app, _ := chk.Appender()
+
+	for i := range s {
+		app.Append(s[i].T(), s[i].V())
+	}
+
+	meta.Chunk = chk
+	return meta
+}
+
+// PopulatedChunk creates a chunk populated with numSamples samples of value
+// 1.0, spaced 1000 timestamp units apart (every second for millisecond
+// timestamps) and starting at minTime.
+func PopulatedChunk(numSamples int, minTime int64) chunks.Meta {
+	samples := make([]Sample, numSamples)
+	for i := 0; i < numSamples; i++ {
+		// Widen before multiplying so the offset cannot overflow a 32-bit int.
+		samples[i] = sample{minTime + int64(i)*1000, 1.0}
+	}
+	return ChunkFromSamples(samples)
+}
--- a/tsdb/wal.go
+++ b/tsdb/wal.go
--- a/tsdb/wal/live_reader.go
+++ b/tsdb/wal/live_reader.go
@ -0,0 +1,322 @@
+// Copyright 2019 The Prometheus Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package wal
+
+import (
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"io"
+
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
+	"github.com/golang/snappy"
+	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// liveReaderMetrics holds all metrics exposed by the LiveReader.
+type liveReaderMetrics struct {
+	// readerCorruptionErrors counts read problems, partitioned by the
+	// "error" label.
+	readerCorruptionErrors *prometheus.CounterVec
+}
+
+// NewLiveReaderMetrics instantiates, registers and returns metrics to be
+// injected at LiveReader instantiation. Registration is skipped when reg is
+// nil, and a registration error is not reported to the caller.
+func NewLiveReaderMetrics(reg prometheus.Registerer) *liveReaderMetrics {
+	m := &liveReaderMetrics{
+		readerCorruptionErrors: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "prometheus_tsdb_wal_reader_corruption_errors_total",
+			Help: "Errors encountered when reading the WAL.",
+		}, []string{"error"}),
+	}
+
+	if reg != nil {
+		reg.Register(m.readerCorruptionErrors)
+	}
+
+	return m
+}
+
+// NewLiveReader returns a new live reader that reads from r, logs to logger
+// and reports to metrics.
+func NewLiveReader(logger log.Logger, metrics *liveReaderMetrics, r io.Reader) *LiveReader {
+	return &LiveReader{
+		rdr:     r,
+		logger:  logger,
+		metrics: metrics,
+
+		// Records spanning pages are tolerated until it is understood how
+		// they come about.
+		permissive: true,
+	}
+}
+
+// LiveReader reads WAL records from an io.Reader. It allows reading of WALs
+// that are still in the process of being written, and returns records as soon
+// as they can be read.
+type LiveReader struct {
+	logger     log.Logger
+	rdr        io.Reader
+	err        error
+	rec        []byte
+	snappyBuf  []byte
+	hdr        [recordHeaderSize]byte
+	buf        [pageSize]byte
+	readIndex  int   // Index in buf to start at for next read.
+	writeIndex int   // Index in buf to start at for next write.
+	total      int64 // Total bytes processed during reading in calls to Next().
+	index      int   // Used to track partial records, should be 0 at the start of every new record.
+
+	// For testing, we can treat EOF as a non-error.
+	eofNonErr bool
+
+	// We sometimes see records span page boundaries. It should never happen,
+	// but it does. Until we track down why, set permissive to true to
+	// tolerate it.
+	// NB the non-live Reader implementation allows for this.
+	permissive bool
+
+	metrics *liveReaderMetrics
+}
+
+// Err returns any error encountered while reading the WAL. An io.EOF is not
+// terminal: Next can be tried again, and it is up to the user to decide when
+// to stop retrying. Any other error is terminal and the reader should not be
+// used again. With eofNonErr set, io.EOF is reported as nil.
+func (r *LiveReader) Err() error {
+	err := r.err
+	if err == io.EOF && r.eofNonErr {
+		err = nil
+	}
+	return err
+}
+
+// Offset returns the number of bytes consumed from this segment, i.e. the
+// running total of bytes that record building has moved past.
+func (r *LiveReader) Offset() int64 {
+	return r.total
+}
+
+// fillBuffer performs a single Read into the free tail of the page buffer and
+// moves writeIndex past whatever was read. It returns the result of that Read
+// unchanged.
+func (r *LiveReader) fillBuffer() (int, error) {
+	free := r.buf[r.writeIndex:]
+	n, err := r.rdr.Read(free)
+	r.writeIndex += n
+	return n, err
+}
+
+// Next returns true if Record() will contain a full record.
+// If Next returns false, you should always check the contents of Err().
+// Returning false guarantees there are no more records if the segment is
+// closed and not corrupt, otherwise if Err() == io.EOF you should try again
+// when more data has been written.
+func (r *LiveReader) Next() bool {
+	for {
+		// If buildRecord returns a non-EOF error, it's game over - the segment
+		// is corrupt. If buildRecord returns an EOF, we try and read more in
+		// fillBuffer later on. If that fails to read anything (n=0 && err=EOF),
+		// we return EOF and the user can try again later. If we have a full
+		// page, buildRecord is guaranteed to return a record or a non-EOF; it
+		// checks that records fit in pages.
+		if ok, err := r.buildRecord(); ok {
+			return true
+		} else if err != nil && err != io.EOF {
+			r.err = err
+			return false
+		}
+
+		// If we've filled the page and not found a record, this
+		// means records have started to span pages. It shouldn't happen,
+		// but it does, and until we find out why we need to deal with this
+		// by moving the unread bytes to the front of the buffer.
+		if r.permissive && r.writeIndex == pageSize && r.readIndex > 0 {
+			copy(r.buf[:], r.buf[r.readIndex:])
+			r.writeIndex -= r.readIndex
+			r.readIndex = 0
+			continue
+		}
+
+		// The whole page has been consumed; start over with an empty buffer.
+		if r.readIndex == pageSize {
+			r.writeIndex = 0
+			r.readIndex = 0
+		}
+
+		// Read more data unless the buffer is already full. A read that
+		// yields no bytes, or fails with anything but EOF, ends this call
+		// with the read's error stored in r.err.
+		if r.writeIndex != pageSize {
+			n, err := r.fillBuffer()
+			if n == 0 || (err != nil && err != io.EOF) {
+				r.err = err
+				return false
+			}
+		}
+	}
+}
+
+// Record returns the current record.
+// The returned byte slice is only valid until the next call to Next, because
+// the reader reuses its backing storage.
+func (r *LiveReader) Record() []byte {
+	return r.rec
+}
+
+// buildRecord rebuilds a full record from potentially partial records. It
+// returns false if there was an error or if we weren't able to read a record
+// for any reason (for example because the buffer holds no unread data), and
+// true if we read a full record. Any record data is appended to
+// LiveReader.rec; compressed data is collected in LiveReader.snappyBuf and
+// decoded into LiveReader.rec once the record is complete.
+func (r *LiveReader) buildRecord() (bool, error) {
+	for {
+		// Check that we have data in the internal buffer to read.
+		if r.writeIndex <= r.readIndex {
+			return false, nil
+		}
+
+		// Attempt to read a record, partial or otherwise.
+		temp, n, err := r.readRecord()
+		if err != nil {
+			return false, err
+		}
+
+		// Account for the consumed bytes. A nil slice without an error means
+		// that only page padding was consumed.
+		r.readIndex += n
+		r.total += int64(n)
+		if temp == nil {
+			return false, nil
+		}
+
+		// A first or full sub-record starts a new record, so drop whatever
+		// was accumulated before.
+		rt := recTypeFromHeader(r.hdr[0])
+		if rt == recFirst || rt == recFull {
+			r.rec = r.rec[:0]
+			r.snappyBuf = r.snappyBuf[:0]
+		}
+
+		compressed := r.hdr[0]&snappyMask != 0
+		if compressed {
+			r.snappyBuf = append(r.snappyBuf, temp...)
+		} else {
+			r.rec = append(r.rec, temp...)
+		}
+
+		if err := validateRecord(rt, r.index); err != nil {
+			r.index = 0
+			return false, err
+		}
+		if rt == recLast || rt == recFull {
+			r.index = 0
+			if compressed && len(r.snappyBuf) > 0 {
+				// The snappy library uses `len` to calculate if we need a new buffer.
+				// In order to allocate as few buffers as possible make the length
+				// equal to the capacity.
+				r.rec = r.rec[:cap(r.rec)]
+				r.rec, err = snappy.Decode(r.rec, r.snappyBuf)
+				if err != nil {
+					return false, err
+				}
+			}
+			return true, nil
+		}
+		// Only increment index for non-zero records since we use it
+		// to determine valid content record sequences.
+		r.index++
+	}
+}
+
+// validateRecord returns an error if typ is not a valid continuation of a
+// record sequence in which i sub-records have been read so far. A full or
+// first sub-record is only valid at the start of a record (i == 0), while a
+// middle or last sub-record is only valid after a first one (i > 0). Any
+// other record type is rejected.
+func validateRecord(typ recType, i int) error {
+	inRecord := i != 0
+
+	switch {
+	case typ == recFull && inRecord:
+		return errors.New("unexpected full record")
+	case typ == recFirst && inRecord:
+		return errors.New("unexpected first record, dropping buffer")
+	case typ == recMiddle && !inRecord:
+		return errors.New("unexpected middle record, dropping buffer")
+	case typ == recLast && !inRecord:
+		return errors.New("unexpected last record, dropping buffer")
+	case typ == recFull, typ == recFirst, typ == recMiddle, typ == recLast:
+		return nil
+	default:
+		return errors.Errorf("unexpected record type %d", typ)
+	}
+}
+
+// readRecord reads a sub-record (see recType) from the buffer, starting at
+// readIndex. It could potentially be a full record (recFull) if the record
+// fits within the bounds of a single page.
+//
+// It returns a byte slice of the record data read (pointing into the internal
+// buffer), the number of bytes read, and an error. The error is io.EOF when
+// the buffer does not yet hold the complete sub-record, and a non-EOF error if
+// there's a non-zero byte in a page term record, the record is too large, or
+// the record checksum fails. For page term records the data slice is nil and
+// only the byte count is set.
+//
+// readRecord does not advance readIndex or total; the caller does that with
+// the returned byte count. It does overwrite r.hdr with the parsed header.
+func (r *LiveReader) readRecord() ([]byte, int, error) {
+	// Special case: for recPageTerm, check that are all zeros to end of page,
+	// consume them but don't return them.
+	if r.buf[r.readIndex] == byte(recPageTerm) {
+		// End of page won't necessarily be end of buffer, as we may have
+		// got misaligned by records spanning page boundaries.
+		// r.total % pageSize is the offset into the current page
+		// that r.readIndex points to in buf.  Therefore
+		// pageSize - (r.total % pageSize) is the amount left to read of
+		// the current page.
+		remaining := int(pageSize - (r.total % pageSize))
+		if r.readIndex+remaining > r.writeIndex {
+			return nil, 0, io.EOF
+		}
+
+		for i := r.readIndex; i < r.readIndex+remaining; i++ {
+			if r.buf[i] != 0 {
+				return nil, 0, errors.New("unexpected non-zero byte in page term bytes")
+			}
+		}
+
+		return nil, remaining, nil
+	}
+
+	// Not a recPageTerm; read the record and check the checksum.
+	if r.writeIndex-r.readIndex < recordHeaderSize {
+		return nil, 0, io.EOF
+	}
+
+	copy(r.hdr[:], r.buf[r.readIndex:r.readIndex+recordHeaderSize])
+	length := int(binary.BigEndian.Uint16(r.hdr[1:]))
+	crc := binary.BigEndian.Uint32(r.hdr[3:])
+	// end is the buffer offset just past this sub-record.
+	end := r.readIndex + recordHeaderSize + length
+	if end > pageSize {
+		if !r.permissive {
+			return nil, 0, fmt.Errorf("record would overflow current page: %d > %d", end, pageSize)
+		}
+		r.metrics.readerCorruptionErrors.WithLabelValues("record_span_page").Inc()
+		// Log the real end offset; the record size alone is not comparable
+		// with "start".
+		level.Warn(r.logger).Log("msg", "record spans page boundaries", "start", r.readIndex, "end", end, "pageSize", pageSize)
+	}
+	if recordHeaderSize+length > pageSize {
+		return nil, 0, fmt.Errorf("record length greater than a single page: %d > %d", recordHeaderSize+length, pageSize)
+	}
+	if end > r.writeIndex {
+		return nil, 0, io.EOF
+	}
+
+	rec := r.buf[r.readIndex+recordHeaderSize : end]
+	if c := crc32.Checksum(rec, castagnoliTable); c != crc {
+		return nil, 0, errors.Errorf("unexpected checksum %x, expected %x", c, crc)
+	}
+
+	return rec, length + recordHeaderSize, nil
+}
+
+// min returns the smaller of i and j.
+func min(i, j int) int {
+	if j <= i {
+		return j
+	}
+	return i
+}
--- a/tsdb/wal/reader.go
+++ b/tsdb/wal/reader.go
@ -0,0 +1,200 @@
+// Copyright 2019 The Prometheus Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package wal
+
+import (
+	"encoding/binary"
+	"hash/crc32"
+	"io"
+
+	"github.com/golang/snappy"
+	"github.com/pkg/errors"
+)
+
+// Reader reads WAL records from an io.Reader.
+// rec holds the current record and snappyBuf the still-compressed data of a
+// compressed record; buf is scratch space for one page.
+type Reader struct {
+	rdr       io.Reader
+	err       error
+	rec       []byte
+	snappyBuf []byte
+	buf       [pageSize]byte
+	total     int64   // Total bytes processed.
+	curRecTyp recType // Used for checking that the last record is not torn.
+}
+
+// NewReader returns a new reader that reads WAL records from r.
+func NewReader(r io.Reader) *Reader {
+	return &Reader{rdr: r}
+}
+
+// Next advances the reader to the next record and returns true if it exists.
+// It must not be called again after it returned false.
+func (r *Reader) Next() bool {
+	err := r.next()
+	if errors.Cause(err) != io.EOF {
+		r.err = err
+		return r.err == nil
+	}
+
+	// End of input. The last WAL segment record shouldn't be torn (it should
+	// be full or last). It would be torn after a crash that happened just
+	// before the last part of the record could be persisted to disk.
+	switch r.curRecTyp {
+	case recFirst, recMiddle:
+		r.err = errors.New("last record is torn")
+	}
+	return false
+}
+
+// next reads sub-records from the underlying reader until they form one full
+// record, which is then available in r.rec. Page padding is consumed and
+// checked on the way. It returns an error wrapping io.EOF when the input ends
+// at the start of a sub-record.
+func (r *Reader) next() (err error) {
+	// We have to use r.buf since allocating byte arrays here fails escape
+	// analysis and ends up on the heap, even though it seemingly should not.
+	hdr := r.buf[:recordHeaderSize]
+	buf := r.buf[recordHeaderSize:]
+
+	r.rec = r.rec[:0]
+	r.snappyBuf = r.snappyBuf[:0]
+
+	// i counts the sub-records of the current record read so far.
+	i := 0
+	for {
+		if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {
+			return errors.Wrap(err, "read first header byte")
+		}
+		r.total++
+		r.curRecTyp = recTypeFromHeader(hdr[0])
+		compressed := hdr[0]&snappyMask != 0
+
+		// Gobble up zero bytes.
+		if r.curRecTyp == recPageTerm {
+			// recPageTerm is a single byte that indicates the rest of the page is padded.
+			// If it's the first byte in a page, buf is too small and
+			// needs to be resized to fit pageSize-1 bytes.
+			buf = r.buf[1:]
+
+			// We are pedantic and check whether the zeros are actually up
+			// to a page boundary.
+			// It's not strictly necessary but may catch sketchy state early.
+			k := pageSize - (r.total % pageSize)
+			if k == pageSize {
+				continue // Initial 0 byte was last page byte.
+			}
+			n, err := io.ReadFull(r.rdr, buf[:k])
+			if err != nil {
+				return errors.Wrap(err, "read remaining zeros")
+			}
+			r.total += int64(n)
+
+			for _, c := range buf[:k] {
+				if c != 0 {
+					return errors.New("unexpected non-zero byte in padded page")
+				}
+			}
+			continue
+		}
+		n, err := io.ReadFull(r.rdr, hdr[1:])
+		if err != nil {
+			return errors.Wrap(err, "read remaining header")
+		}
+		r.total += int64(n)
+
+		// The header bytes after the type byte hold the 16-bit payload length
+		// followed by the 32-bit checksum, both big-endian.
+		var (
+			length = binary.BigEndian.Uint16(hdr[1:])
+			crc    = binary.BigEndian.Uint32(hdr[3:])
+		)
+
+		if length > pageSize-recordHeaderSize {
+			return errors.Errorf("invalid record size %d", length)
+		}
+		// NOTE(review): unlike the other read errors this one is returned
+		// unwrapped - confirm whether that is intended.
+		n, err = io.ReadFull(r.rdr, buf[:length])
+		if err != nil {
+			return err
+		}
+		r.total += int64(n)
+
+		if n != int(length) {
+			return errors.Errorf("invalid size: expected %d, got %d", length, n)
+		}
+		if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
+			return errors.Errorf("unexpected checksum %x, expected %x", c, crc)
+		}
+
+		// Compressed payloads are collected and only decoded once the record
+		// is complete.
+		if compressed {
+			r.snappyBuf = append(r.snappyBuf, buf[:length]...)
+		} else {
+			r.rec = append(r.rec, buf[:length]...)
+		}
+
+		if err := validateRecord(r.curRecTyp, i); err != nil {
+			return err
+		}
+		if r.curRecTyp == recLast || r.curRecTyp == recFull {
+			if compressed && len(r.snappyBuf) > 0 {
+				// The snappy library uses `len` to calculate if we need a new buffer.
+				// In order to allocate as few buffers as possible make the length
+				// equal to the capacity.
+				r.rec = r.rec[:cap(r.rec)]
+				r.rec, err = snappy.Decode(r.rec, r.snappyBuf)
+				return err
+			}
+			return nil
+		}
+
+		// Only increment i for non-zero records since we use it
+		// to determine valid content record sequences.
+		i++
+	}
+}
+
+// Err returns the last encountered error wrapped in a corruption error, or
+// nil if there was none. When reading through a segmentBufReader the error
+// carries the directory, index and offset of the current segment; otherwise
+// the segment is -1 and the offset is the total offset in the reader stream.
+func (r *Reader) Err() error {
+	if r.err == nil {
+		return nil
+	}
+
+	cerr := &CorruptionErr{
+		Err:     r.err,
+		Segment: -1,
+		Offset:  r.total,
+	}
+	if sbr, ok := r.rdr.(*segmentBufReader); ok {
+		seg := sbr.segs[sbr.cur]
+		cerr.Dir = seg.Dir()
+		cerr.Segment = seg.Index()
+		cerr.Offset = int64(sbr.off)
+	}
+	return cerr
+}
+
+// Record returns the current record. The returned byte slice is only
+// valid until the next call to Next, which reuses its backing storage.
+func (r *Reader) Record() []byte {
+	return r.rec
+}
+
+// Segment returns the current segment being read, or -1 if the underlying
+// reader is not a segmentBufReader.
+func (r *Reader) Segment() int {
+	if b, ok := r.rdr.(*segmentBufReader); ok {
+		return b.segs[b.cur].Index()
+	}
+	return -1
+}
+
+// Offset returns the current position of the segment being read. If the
+// underlying reader is not a segmentBufReader, the total number of bytes
+// processed so far is returned instead.
+func (r *Reader) Offset() int64 {
+	if b, ok := r.rdr.(*segmentBufReader); ok {
+		return int64(b.off)
+	}
+	return r.total
+}
--- a/tsdb/wal/reader_test.go
+++ b/tsdb/wal/reader_test.go
@ -0,0 +1,549 @@
+// Copyright 2019 The Prometheus Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package wal
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/go-kit/kit/log"
+	tsdb_errors "github.com/prometheus/tsdb/errors"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+type reader interface { // reader is the method set the tests need; both NewReader and NewLiveReader results are used through it.
+	Next() bool
+	Err() error
+	Record() []byte
+	Offset() int64
+}
+
+type record struct { // record is one fragment handed to encodedRecord when building a test stream.
+	t recType // Fragment type.
+	b []byte  // Fragment bytes that follow the type/header.
+}
+
+var readerConstructors = map[string]func(io.Reader) reader{ // Reader implementations under test, keyed by the name used for sub-tests.
+	"Reader": func(r io.Reader) reader {
+		return NewReader(r)
+	},
+	"LiveReader": func(r io.Reader) reader {
+		lr := NewLiveReader(log.NewNopLogger(), NewLiveReaderMetrics(nil), r)
+		lr.eofNonErr = true // NOTE(review): presumably makes Err() report nil at EOF so the shared expectations apply — confirm in LiveReader.
+		return lr
+	},
+}
+
+var data = make([]byte, 100000) // Zero-filled buffer that the test cases below slice their payloads from.
+var testReaderCases = []struct {
+	t    []record // Fragments encoded into the input stream, in order.
+	exp  [][]byte // Records the reader is expected to return, in order.
+	fail bool     // Whether the reader must finish with a non-nil error.
+}{
+	// Sequence of valid records.
+	{
+		t: []record{
+			{recFull, data[0:200]},
+			{recFirst, data[200:300]},
+			{recLast, data[300:400]},
+			{recFirst, data[400:800]},
+			{recMiddle, data[800:900]},
+			{recPageTerm, make([]byte, pageSize-900-recordHeaderSize*5-1)}, // exactly lines up with page boundary.
+			{recLast, data[900:900]},
+			{recFirst, data[900:1000]},
+			{recMiddle, data[1000:1200]},
+			{recMiddle, data[1200:30000]},
+			{recMiddle, data[30000:30001]},
+			{recMiddle, data[30001:30001]},
+			{recLast, data[30001:32000]},
+		},
+		exp: [][]byte{
+			data[0:200],
+			data[200:400],
+			data[400:900],
+			data[900:32000],
+		},
+	},
+	// Exactly at the limit of one page minus the header size.
+	{
+		t: []record{
+			{recFull, data[0 : pageSize-recordHeaderSize]},
+		},
+		exp: [][]byte{
+			data[:pageSize-recordHeaderSize],
+		},
+	},
+	// More than a full page, this exceeds our buffer and can never happen
+	// when written by the WAL.
+	{
+		t: []record{
+			{recFull, data[0 : pageSize+1]},
+		},
+		fail: true,
+	},
+	// Two records that together are too big for a page.
+	// NB currently the non-live reader succeeds on this. I think this is a bug,
+	// but we've seen it in production.
+	{
+		t: []record{
+			{recFull, data[:pageSize/2]},
+			{recFull, data[:pageSize/2]},
+		},
+		exp: [][]byte{
+			data[:pageSize/2],
+			data[:pageSize/2],
+		},
+	},
+	// Invalid orders of record types.
+	{
+		t:    []record{{recMiddle, data[:200]}},
+		fail: true,
+	},
+	{
+		t:    []record{{recLast, data[:200]}},
+		fail: true,
+	},
+	{
+		t: []record{
+			{recFirst, data[:200]},
+			{recFull, data[200:400]},
+		},
+		fail: true,
+	},
+	{
+		t: []record{
+			{recFirst, data[:100]},
+			{recMiddle, data[100:200]},
+			{recFull, data[200:400]},
+		},
+		fail: true,
+	},
+	// Non-zero data after page termination.
+	{
+		t: []record{
+			{recFull, data[:100]},
+			{recPageTerm, append(make([]byte, pageSize-recordHeaderSize-102), 1)},
+		},
+		exp:  [][]byte{data[:100]},
+		fail: true,
+	},
+}
+
+// encodedRecord serializes b as a single fragment of type t.
+// A recPageTerm fragment is a zero byte followed by b verbatim. Every other
+// type gets a recordHeaderSize-byte header (type byte, big-endian uint16
+// length of b, big-endian uint32 Castagnoli CRC of b) followed by b.
+func encodedRecord(t recType, b []byte) []byte {
+	if t == recPageTerm {
+		out := []byte{0}
+		return append(out, b...)
+	}
+	out := make([]byte, recordHeaderSize, recordHeaderSize+len(b))
+	out[0] = byte(t)
+	binary.BigEndian.PutUint16(out[1:], uint16(len(b)))
+	binary.BigEndian.PutUint32(out[3:], crc32.Checksum(b, castagnoliTable))
+	return append(out, b...)
+}
+
+// TestReader feeds every reader implementation a stream of encoded records
+// with different types and checks the returned records and the final error
+// state against the test case.
+func TestReader(t *testing.T) {
+	for name, newReader := range readerConstructors {
+		for i, tc := range testReaderCases {
+			t.Run(fmt.Sprintf("%s/%d", name, i), func(t *testing.T) {
+				var stream []byte
+				for _, frag := range tc.t {
+					stream = append(stream, encodedRecord(frag.t, frag.b)...)
+				}
+				r := newReader(bytes.NewReader(stream))
+
+				j := 0
+				for r.Next() {
+					t.Logf("record %d", j)
+					got := r.Record()
+
+					if j >= len(tc.exp) {
+						t.Fatal("received more records than expected")
+					}
+					testutil.Equals(t, tc.exp[j], got, "Bytes within record did not match expected Bytes")
+					j++
+				}
+
+				switch {
+				case !tc.fail && r.Err() != nil:
+					t.Fatalf("unexpected error: %s", r.Err())
+				case tc.fail && r.Err() == nil:
+					t.Fatalf("expected error but got none")
+				}
+			})
+		}
+	}
+}
+
+// TestReader_Live runs every reader test case against a LiveReader that
+// reads a file through a second descriptor while a background goroutine is
+// still appending the encoded fragments to it.
+func TestReader_Live(t *testing.T) {
+	logger := testutil.NewLogger(t)
+
+	for i := range testReaderCases {
+		t.Run(strconv.Itoa(i), func(t *testing.T) {
+			writeFd, err := ioutil.TempFile("", "TestReader_Live")
+			testutil.Ok(t, err)
+			defer os.Remove(writeFd.Name())
+
+			go func(i int) {
+				defer writeFd.Close()
+				for _, rec := range testReaderCases[i].t {
+					// Failures are reported with t.Errorf because helpers that
+					// end in FailNow must only run on the test goroutine.
+					if _, err := writeFd.Write(encodedRecord(rec.t, rec.b)); err != nil {
+						t.Errorf("write record: %v", err)
+						return
+					}
+					runtime.Gosched()
+				}
+			}(i)
+
+			// Read from a second FD on the same file.
+			readFd, err := os.Open(writeFd.Name())
+			testutil.Ok(t, err)
+			// Registered after os.Remove, so the descriptor is closed before
+			// the file is removed.
+			defer readFd.Close()
+
+			reader := NewLiveReader(logger, NewLiveReaderMetrics(nil), readFd)
+			for _, exp := range testReaderCases[i].exp {
+				// Spin until the writer has produced the next record; until
+				// then the only acceptable state is EOF.
+				for !reader.Next() {
+					testutil.Assert(t, reader.Err() == io.EOF, "expect EOF, got: %v", reader.Err())
+					runtime.Gosched()
+				}
+
+				actual := reader.Record()
+				testutil.Equals(t, exp, actual, "read wrong record")
+			}
+
+			testutil.Assert(t, !reader.Next(), "unexpected record")
+			if testReaderCases[i].fail {
+				testutil.Assert(t, reader.Err() != nil, "expected error")
+			}
+		})
+	}
+}
+
+const fuzzLen = 500 // Number of random records generateRandomEntries produces per run.
+
+// generateRandomEntries creates fuzzLen records of random content and random
+// length, sends each one on records in generation order, and logs them to w
+// in randomly sized batches. It returns the first error from rand.Read or
+// w.Log.
+func generateRandomEntries(w *WAL, records chan []byte) error {
+	var batch [][]byte
+	for i := 0; i < fuzzLen; i++ {
+		// Upper bound for the record length: small, up to a page, or up to
+		// eight pages.
+		var limit int64 = pageSize * 8
+		switch i % 5 {
+		case 0, 1:
+			limit = 50
+		case 2, 3:
+			limit = pageSize
+		}
+
+		rec := make([]byte, rand.Int63n(limit))
+		if _, err := rand.Read(rec); err != nil {
+			return err
+		}
+
+		records <- rec
+
+		// Randomly batch up records: three times out of four the pending
+		// batch is written out now.
+		batch = append(batch, rec)
+		if rand.Intn(4) >= 3 {
+			continue
+		}
+		if err := w.Log(batch...); err != nil {
+			return err
+		}
+		batch = batch[:0]
+	}
+	return w.Log(batch...)
+}
+
+// multiReadCloser pairs one logical read stream with the closers of the
+// resources backing it.
+type multiReadCloser struct {
+	reader  io.Reader
+	closers []io.Closer
+}
+
+// Read reads from the wrapped reader.
+func (m *multiReadCloser) Read(p []byte) (n int, err error) {
+	n, err = m.reader.Read(p)
+	return n, err
+}
+
+// Close calls Close on every registered closer, feeding each result into a
+// MultiError whose Err() value is returned.
+func (m *multiReadCloser) Close() error {
+	var errs tsdb_errors.MultiError
+	for i := range m.closers {
+		errs.Add(m.closers[i].Close())
+	}
+	return errs.Err()
+}
+
+func allSegments(dir string) (io.ReadCloser, error) {
+	seg, err := listSegments(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	var readers []io.Reader
+	var closers []io.Closer
+	for _, r := range seg {
+		f, err := os.Open(filepath.Join(dir, r.name))
+		if err != nil {
+			return nil, err
+		}
+		readers = append(readers, f)
+		closers = append(closers, f)
+	}
+
+	return &multiReadCloser{
+		reader:  io.MultiReader(readers...),
+		closers: closers,
+	}, nil
+}
+
+// TestReaderFuzz writes random records to a WAL, closes it, and checks that
+// every reader implementation returns exactly those records in order, both
+// with and without compression.
+func TestReaderFuzz(t *testing.T) {
+	for name, newReader := range readerConstructors {
+		for _, compress := range []bool{false, true} {
+			t.Run(fmt.Sprintf("%s,compress=%t", name, compress), func(t *testing.T) {
+				dir, err := ioutil.TempDir("", "wal_fuzz_live")
+				testutil.Ok(t, err)
+				defer func() {
+					testutil.Ok(t, os.RemoveAll(dir))
+				}()
+
+				w, err := NewSize(nil, nil, dir, 128*pageSize, compress)
+				testutil.Ok(t, err)
+
+				// The channel must be able to hold every record because
+				// nothing reads from it until the writer has finished.
+				want := make(chan []byte, fuzzLen)
+				testutil.Ok(t, generateRandomEntries(w, want))
+				close(want)
+
+				testutil.Ok(t, w.Close())
+
+				sr, err := allSegments(w.Dir())
+				testutil.Ok(t, err)
+				defer sr.Close()
+
+				rd := newReader(sr)
+				for rec := range want {
+					testutil.Assert(t, rd.Next(), "expected record: %v", rd.Err())
+					testutil.Equals(t, rec, rd.Record(), "read wrong record")
+				}
+				testutil.Assert(t, !rd.Next(), "unexpected record")
+			})
+		}
+	}
+}
+
+// TestReaderFuzz_Live tails a WAL with a LiveReader while a background
+// goroutine is still writing random records to it, and checks that the
+// records come back in the order they were generated, following the writer
+// across segment boundaries.
+func TestReaderFuzz_Live(t *testing.T) {
+	logger := testutil.NewLogger(t)
+	for _, compress := range []bool{false, true} {
+		t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
+			dir, err := ioutil.TempDir("", "wal_fuzz_live")
+			testutil.Ok(t, err)
+			defer func() {
+				testutil.Ok(t, os.RemoveAll(dir))
+			}()
+
+			w, err := NewSize(nil, nil, dir, 128*pageSize, compress)
+			testutil.Ok(t, err)
+			defer w.Close()
+
+			// In the background, generate a stream of random records and write them
+			// to the WAL.
+			input := make(chan []byte, fuzzLen/10) // buffering required as we sometimes batch WAL writes.
+			done := make(chan struct{})
+			go func() {
+				// Report with t.Errorf instead of a FailNow-style helper:
+				// FailNow must only run on the test goroutine, and done has
+				// to be closed even on failure or the loop below never ends.
+				if err := generateRandomEntries(w, input); err != nil {
+					t.Errorf("generate random entries: %v", err)
+				}
+				time.Sleep(100 * time.Millisecond)
+				close(done)
+			}()
+
+			// Tail the WAL and compare the results.
+			m, _, err := w.Segments()
+			testutil.Ok(t, err)
+
+			seg, err := OpenReadSegment(SegmentName(dir, m))
+			testutil.Ok(t, err)
+			defer seg.Close()
+
+			r := NewLiveReader(logger, nil, seg)
+			segmentTicker := time.NewTicker(100 * time.Millisecond)
+			defer segmentTicker.Stop()
+			readTicker := time.NewTicker(10 * time.Millisecond)
+			defer readTicker.Stop()
+
+			// readSegment drains everything currently readable from r and
+			// requires the reader to stop at EOF.
+			readSegment := func(r *LiveReader) bool {
+				for r.Next() {
+					rec := r.Record()
+					expected, ok := <-input
+					testutil.Assert(t, ok, "unexpected record")
+					testutil.Equals(t, expected, rec, "record does not match expected")
+				}
+				testutil.Assert(t, r.Err() == io.EOF, "expected EOF, got: %v", r.Err())
+				return true
+			}
+
+		outer:
+			for {
+				select {
+				case <-segmentTicker.C:
+					// check if new segments exist
+					_, last, err := w.Segments()
+					testutil.Ok(t, err)
+					if last <= seg.i {
+						continue
+					}
+
+					// read to end of segment.
+					readSegment(r)
+
+					fi, err := os.Stat(SegmentName(dir, seg.i))
+					testutil.Ok(t, err)
+					testutil.Assert(t, r.Offset() == fi.Size(), "expected to have read whole segment, but read %d of %d", r.Offset(), fi.Size())
+
+					seg, err = OpenReadSegment(SegmentName(dir, seg.i+1))
+					testutil.Ok(t, err)
+					defer seg.Close()
+					r = NewLiveReader(logger, nil, seg)
+
+				case <-readTicker.C:
+					readSegment(r)
+
+				case <-done:
+					readSegment(r)
+					break outer
+				}
+			}
+
+			testutil.Assert(t, r.Err() == io.EOF, "expected EOF")
+		})
+	}
+}
+
+// TestLiveReaderCorrupt_ShortFile checks that a LiveReader returns no record
+// and reports io.EOF for a segment whose single page-sized record was cut
+// off halfway.
+func TestLiveReaderCorrupt_ShortFile(t *testing.T) {
+	// Write a corrupt WAL segment, there is one record of pageSize in length,
+	// but the segment is only half written.
+	logger := testutil.NewLogger(t)
+	dir, err := ioutil.TempDir("", "wal_live_corrupt")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	w, err := NewSize(nil, nil, dir, pageSize, false)
+	testutil.Ok(t, err)
+
+	payload := make([]byte, pageSize-recordHeaderSize)
+	_, err = rand.Read(payload)
+	testutil.Ok(t, err)
+
+	testutil.Ok(t, w.Log(payload))
+	testutil.Ok(t, w.Close())
+
+	// Cut the segment down to half a page.
+	segmentFile, err := os.OpenFile(filepath.Join(dir, "00000000"), os.O_RDWR, 0666)
+	testutil.Ok(t, err)
+	testutil.Ok(t, segmentFile.Truncate(pageSize/2))
+	testutil.Ok(t, segmentFile.Close())
+
+	// Try and LiveReader it.
+	first, _, err := w.Segments()
+	testutil.Ok(t, err)
+
+	seg, err := OpenReadSegment(SegmentName(dir, first))
+	testutil.Ok(t, err)
+	defer seg.Close()
+
+	r := NewLiveReader(logger, nil, seg)
+	testutil.Assert(t, r.Next() == false, "expected no records")
+	testutil.Assert(t, r.Err() == io.EOF, "expected error, got: %v", r.Err())
+}
+
+func TestLiveReaderCorrupt_RecordTooLongAndShort(t *testing.T) {
+	// Write a corrupt WAL segment, when record len > page size.
+	logger := testutil.NewLogger(t)
+	dir, err := ioutil.TempDir("", "wal_live_corrupt")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	w, err := NewSize(nil, nil, dir, pageSize*2, false)
+	testutil.Ok(t, err)
+
+	rec := make([]byte, pageSize-recordHeaderSize)
+	_, err = rand.Read(rec)
+	testutil.Ok(t, err)
+
+	err = w.Log(rec)
+	testutil.Ok(t, err)
+
+	err = w.Close()
+	testutil.Ok(t, err)
+
+	segmentFile, err := os.OpenFile(filepath.Join(dir, "00000000"), os.O_RDWR, 0666)
+	testutil.Ok(t, err)
+
+	// Override the record length
+	buf := make([]byte, 3)
+	buf[0] = byte(recFull)
+	binary.BigEndian.PutUint16(buf[1:], 0xFFFF)
+	_, err = segmentFile.WriteAt(buf, 0)
+	testutil.Ok(t, err)
+
+	err = segmentFile.Close()
+	testutil.Ok(t, err)
+
+	// Try and LiveReader it.
+	m, _, err := w.Segments()
+	testutil.Ok(t, err)
+
+	seg, err := OpenReadSegment(SegmentName(dir, m))
+	testutil.Ok(t, err)
+	defer seg.Close()
+
+	r := NewLiveReader(logger, NewLiveReaderMetrics(nil), seg)
+	testutil.Assert(t, r.Next() == false, "expected no records")
+	testutil.Assert(t, r.Err().Error() == "record length greater than a single page: 65542 > 32768", "expected error, got: %v", r.Err())
+}
+
+// TestReaderData reads the WAL stored in the directory named by the WALDIR
+// environment variable with every reader implementation and requires that
+// the read finishes without error. The test is skipped when WALDIR is unset.
+func TestReaderData(t *testing.T) {
+	dir := os.Getenv("WALDIR")
+	if dir == "" {
+		// Report the test as skipped rather than silently passing.
+		t.Skip("WALDIR not set")
+	}
+
+	for name, fn := range readerConstructors {
+		t.Run(name, func(t *testing.T) {
+			w, err := New(nil, nil, dir, true)
+			testutil.Ok(t, err)
+			// Release the segment opened by New once the sub-test is done.
+			defer w.Close()
+
+			sr, err := allSegments(dir)
+			testutil.Ok(t, err)
+			// Close every segment file opened for reading.
+			defer sr.Close()
+
+			reader := fn(sr)
+			for reader.Next() {
+			}
+			testutil.Ok(t, reader.Err())
+
+			err = w.Repair(reader.Err())
+			testutil.Ok(t, err)
+		})
+	}
+}
--- a/tsdb/wal/wal.go
+++ b/tsdb/wal/wal.go
@ -0,0 +1,856 @@
+// Copyright 2017 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package wal
+
+import (
+	"bufio"
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
+	"github.com/golang/snappy"
+	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/tsdb/fileutil"
+)
+
+const (
+	DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
+	pageSize           = 32 * 1024         // 32KB
+	recordHeaderSize   = 7                 // 1 type byte + 2 length bytes + 4 CRC bytes.
+)
+
+// castagnoliTable is the CRC-32 (Castagnoli) table used to checksum record fragments.
+// The table gets initialized with sync.Once but may still cause a race
+// with any other use of the crc32 package anywhere. Thus we initialize it before.
+var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
+
+// page is an in-memory buffer used to batch disk writes.
+// Records bigger than the page size are split and flushed separately.
+// A flush is triggered when a single record doesn't fit the page size or
+// when the next record can't fit in the remaining free page space.
+type page struct {
+	alloc   int // Bytes of buf already claimed by records.
+	flushed int // Bytes of buf already written to the segment.
+	buf     [pageSize]byte
+}
+
+// remaining returns the number of unclaimed bytes left in the page.
+func (p *page) remaining() int {
+	return pageSize - p.alloc
+}
+
+// full reports whether the page has less room left than a record header
+// needs.
+func (p *page) full() bool {
+	return p.remaining() < recordHeaderSize
+}
+
+// Segment represents a segment file: an open *os.File plus the directory and index it belongs to.
+type Segment struct {
+	*os.File
+	dir string // Directory the segment file lives in.
+	i   int    // Index of the segment.
+}
+
+// Index returns the index of the segment.
+func (s *Segment) Index() int {
+	return s.i
+}
+
+// Dir returns the directory of the segment.
+func (s *Segment) Dir() string {
+	return s.dir
+}
+
+// CorruptionErr is the error returned when corruption is encountered.
+// A negative Segment means the position is only known as a byte offset
+// (Offset) rather than as a location inside a specific segment file.
+type CorruptionErr struct {
+	Dir     string
+	Segment int
+	Offset  int64
+	Err     error
+}
+
+// Error formats the corruption position together with the underlying error.
+func (e *CorruptionErr) Error() string {
+	if e.Segment >= 0 {
+		return fmt.Sprintf("corruption in segment %s at %d: %s", SegmentName(e.Dir, e.Segment), e.Offset, e.Err)
+	}
+	return fmt.Sprintf("corruption after %d bytes: %s", e.Offset, e.Err)
+}
+
+// OpenWriteSegment opens the existing segment k in dir for appending.
+// If the file size is not a multiple of pageSize, the last page is padded
+// with zeros (and a warning is logged) before the segment is returned.
+func OpenWriteSegment(logger log.Logger, dir string, k int) (*Segment, error) {
+	name := SegmentName(dir, k)
+	f, err := os.OpenFile(name, os.O_WRONLY|os.O_APPEND, 0666)
+	if err != nil {
+		return nil, err
+	}
+	fi, err := f.Stat()
+	if err != nil {
+		f.Close()
+		return nil, err
+	}
+	// If the last page is torn, fill it with zeros.
+	// In case it was torn after all records were written successfully, this
+	// will just pad the page and everything will be fine.
+	// If it was torn mid-record, a full read (which the caller should do anyway
+	// to ensure integrity) will detect it as a corruption by the end.
+	torn := fi.Size() % pageSize
+	if torn == 0 {
+		return &Segment{File: f, i: k, dir: dir}, nil
+	}
+	level.Warn(logger).Log("msg", "last page of the wal is torn, filling it with zeros", "segment", name)
+	padding := make([]byte, pageSize-torn)
+	if _, err := f.Write(padding); err != nil {
+		f.Close()
+		return nil, errors.Wrap(err, "zero-pad torn page")
+	}
+	return &Segment{File: f, i: k, dir: dir}, nil
+}
+
+// CreateSegment creates a new segment k in dir.
+func CreateSegment(dir string, k int) (*Segment, error) {
+	f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
+	if err != nil {
+		return nil, err
+	}
+	return &Segment{File: f, i: k, dir: dir}, nil
+}
+
+// OpenReadSegment opens the segment file fn for reading. The base name of fn
+// must parse as a decimal integer, which becomes the segment index; the
+// directory part of fn becomes the segment directory.
+func OpenReadSegment(fn string) (*Segment, error) {
+	idx, err := strconv.Atoi(filepath.Base(fn))
+	if err != nil {
+		return nil, errors.New("not a valid filename")
+	}
+	f, err := os.Open(fn)
+	if err != nil {
+		return nil, err
+	}
+	seg := &Segment{File: f, i: idx, dir: filepath.Dir(fn)}
+	return seg, nil
+}
+
+// WAL is a write ahead log that stores records in segment files.
+// It must be read from start to end once before logging new data.
+// If an error occurs during read, the repair procedure must be called
+// before it's safe to do further writes.
+//
+// Segments are written to in pages of 32KB, with records possibly split
+// across page boundaries.
+// Records are never split across segments to allow full segments to be
+// safely truncated. It also ensures that torn writes never corrupt records
+// beyond the most recent segment.
+type WAL struct {
+	dir         string             // Directory holding the segment files.
+	logger      log.Logger         // Destination for the WAL's own log messages.
+	segmentSize int                // Segment size in bytes; NewSize requires a multiple of pageSize.
+	mtx         sync.RWMutex       // Held by Log, NextSegment and Close.
+	segment     *Segment           // Active segment.
+	donePages   int                // Pages written to the segment.
+	page        *page              // Active page.
+	stopc       chan chan struct{} // Delivers a done channel to run(), which closes it once actorc is drained.
+	actorc      chan func()        // Functions executed one by one by run().
+	closed      bool               // To allow calling Close() more than once without blocking.
+	compress    bool               // Whether log tries to snappy-compress records.
+	snappyBuf   []byte             // Reused output buffer for snappy.Encode.
+
+	fsyncDuration   prometheus.Summary // Duration of WAL fsync.
+	pageFlushes     prometheus.Counter // Total number of page flushes.
+	pageCompletions prometheus.Counter // Total number of completed pages.
+	truncateFail    prometheus.Counter // Total number of WAL truncations that failed.
+	truncateTotal   prometheus.Counter // Total number of WAL truncations attempted.
+	currentSegment  prometheus.Gauge   // WAL segment index that TSDB is currently writing to.
+}
+
+// New returns a new WAL over the given directory, using DefaultSegmentSize as the segment size.
+func New(logger log.Logger, reg prometheus.Registerer, dir string, compress bool) (*WAL, error) {
+	return NewSize(logger, reg, dir, DefaultSegmentSize, compress)
+}
+
+// NewSize returns a new WAL over the given directory.
+// New segments are created with the specified size, which must be a
+// positive multiple of pageSize. The directory is created if needed and a
+// nil logger is replaced by a no-op logger.
+//
+// A fresh segment is always opened for writing: index 0 for an empty
+// directory, otherwise one past the highest existing segment index. The
+// background goroutine that runs queued functions is started before
+// returning.
+func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int, compress bool) (*WAL, error) {
+	if segmentSize <= 0 || segmentSize%pageSize != 0 {
+		return nil, errors.New("invalid segment size")
+	}
+	if err := os.MkdirAll(dir, 0777); err != nil {
+		return nil, errors.Wrap(err, "create dir")
+	}
+	if logger == nil {
+		logger = log.NewNopLogger()
+	}
+	w := &WAL{
+		dir:         dir,
+		logger:      logger,
+		segmentSize: segmentSize,
+		page:        &page{},
+		actorc:      make(chan func(), 100),
+		stopc:       make(chan chan struct{}),
+		compress:    compress,
+	}
+	registerMetrics(reg, w)
+
+	_, last, err := w.Segments()
+	if err != nil {
+		return nil, errors.Wrap(err, "get segment range")
+	}
+	// Index of the Segment we want to open and write to.
+	// If some segments already exist create one with a higher index than the last segment.
+	writeSegmentIndex := 0
+	if last != -1 {
+		writeSegmentIndex = last + 1
+	}
+
+	segment, err := CreateSegment(w.dir, writeSegmentIndex)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := w.setSegment(segment); err != nil {
+		// Do not leak the file handle; the setSegment error is the one
+		// reported to the caller.
+		segment.Close()
+		return nil, err
+	}
+
+	go w.run()
+
+	return w, nil
+}
+
+// Open returns a WAL value for the existing WAL in dir. It only records the
+// directory and logger (a no-op logger when nil) and sets up the metrics; no
+// segment is opened and no background goroutine is started.
+func Open(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) {
+	w := &WAL{dir: dir, logger: logger}
+	if w.logger == nil {
+		w.logger = log.NewNopLogger()
+	}
+
+	registerMetrics(reg, w)
+	return w, nil
+}
+
+// registerMetrics creates the WAL's metrics, stores them on w, and registers
+// them with reg when reg is non-nil.
+func registerMetrics(reg prometheus.Registerer, w *WAL) {
+	counter := func(name, help string) prometheus.Counter {
+		return prometheus.NewCounter(prometheus.CounterOpts{Name: name, Help: help})
+	}
+
+	w.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Name:       "prometheus_tsdb_wal_fsync_duration_seconds",
+		Help:       "Duration of WAL fsync.",
+		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
+	})
+	w.pageFlushes = counter("prometheus_tsdb_wal_page_flushes_total", "Total number of page flushes.")
+	w.pageCompletions = counter("prometheus_tsdb_wal_completed_pages_total", "Total number of completed pages.")
+	w.truncateFail = counter("prometheus_tsdb_wal_truncations_failed_total", "Total number of WAL truncations that failed.")
+	w.truncateTotal = counter("prometheus_tsdb_wal_truncations_total", "Total number of WAL truncations attempted.")
+	w.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "prometheus_tsdb_wal_segment_current",
+		Help: "WAL segment index that TSDB is currently writing to.",
+	})
+
+	if reg == nil {
+		return
+	}
+	reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail, w.truncateTotal, w.currentSegment)
+}
+
+// CompressionEnabled reports whether compression is enabled on this WAL.
+func (w *WAL) CompressionEnabled() bool {
+	return w.compress
+}
+
+// Dir returns the directory the WAL stores its segment files in.
+func (w *WAL) Dir() string {
+	return w.dir
+}
+
+// run executes the functions sent on actorc one at a time until a done
+// channel arrives on stopc. It then closes actorc, runs whatever is still
+// queued, and finally closes the done channel.
+func (w *WAL) run() {
+	for stopped := false; !stopped; {
+		select {
+		case fn := <-w.actorc:
+			fn()
+		case donec := <-w.stopc:
+			close(w.actorc)
+			defer close(donec)
+			stopped = true
+		}
+	}
+	// Drain and process any remaining functions.
+	for fn := range w.actorc {
+		fn()
+	}
+}
+
+// Repair attempts to repair the WAL based on the error.
+// It discards all data after the corruption.
+//
+// origErr must be (or wrap) a *CorruptionErr with a non-negative Segment.
+// All segments after the corrupted one are deleted, the corrupted segment is
+// rewritten with the records that precede the corruption offset, and a new
+// empty segment following it becomes the active one.
+func (w *WAL) Repair(origErr error) error {
+	// We could probably have a mode that only discards torn records right around
+	// the corruption to preserve as much data as possible.
+	// But that's not generally applicable if the records have any kind of causality.
+	// Maybe as an extra mode in the future if mid-WAL corruptions become
+	// a frequent concern.
+	err := errors.Cause(origErr) // So that we can pick up errors even if wrapped.
+
+	cerr, ok := err.(*CorruptionErr)
+	if !ok {
+		return errors.Wrap(origErr, "cannot handle error")
+	}
+	if cerr.Segment < 0 {
+		return errors.New("corruption error does not specify position")
+	}
+	level.Warn(w.logger).Log("msg", "starting corruption repair",
+		"segment", cerr.Segment, "offset", cerr.Offset)
+
+	// All segments behind the corruption can no longer be used.
+	segs, err := listSegments(w.dir)
+	if err != nil {
+		return errors.Wrap(err, "list segments")
+	}
+	level.Warn(w.logger).Log("msg", "deleting all segments newer than corrupted segment", "segment", cerr.Segment)
+
+	for _, s := range segs {
+		if w.segment.i == s.index {
+			// The active segment needs to be removed,
+			// close it first (Windows!). Can be closed safely
+			// as we set the current segment to repaired file
+			// below.
+			if err := w.segment.Close(); err != nil {
+				return errors.Wrap(err, "close active segment")
+			}
+		}
+		if s.index <= cerr.Segment {
+			continue
+		}
+		if err := os.Remove(filepath.Join(w.dir, s.name)); err != nil {
+			return errors.Wrapf(err, "delete segment:%v", s.index)
+		}
+	}
+	// Regardless of the corruption offset, no record reaches into the previous segment.
+	// So we can safely repair the WAL by removing the segment and re-inserting all
+	// its records up to the corruption.
+	level.Warn(w.logger).Log("msg", "rewrite corrupted segment", "segment", cerr.Segment)
+
+	fn := SegmentName(w.dir, cerr.Segment)
+	tmpfn := fn + ".repair"
+
+	if err := fileutil.Rename(fn, tmpfn); err != nil {
+		return err
+	}
+	// Create a clean segment and make it the active one.
+	s, err := CreateSegment(w.dir, cerr.Segment)
+	if err != nil {
+		return err
+	}
+	if err := w.setSegment(s); err != nil {
+		return err
+	}
+
+	f, err := os.Open(tmpfn)
+	if err != nil {
+		return errors.Wrap(err, "open segment")
+	}
+	defer f.Close()
+
+	r := NewReader(bufio.NewReader(f))
+
+	for r.Next() {
+		// Add records only up to where the error was.
+		if r.Offset() >= cerr.Offset {
+			break
+		}
+		if err := w.Log(r.Record()); err != nil {
+			return errors.Wrap(err, "insert record")
+		}
+	}
+	// We expect an error here from r.Err(), so nothing to handle.
+
+	// We need to pad to the end of the last page in the repaired segment.
+	// A failed flush means the repaired segment is incomplete, so it must
+	// not be reported as a successful repair.
+	if err := w.flushPage(true); err != nil {
+		return errors.Wrap(err, "flush page in repair")
+	}
+
+	// We explicitly close even when there is a defer for Windows to be
+	// able to delete it. The defer is in place to close it in-case there
+	// are errors above.
+	if err := f.Close(); err != nil {
+		return errors.Wrap(err, "close corrupted file")
+	}
+	if err := os.Remove(tmpfn); err != nil {
+		return errors.Wrap(err, "delete corrupted segment")
+	}
+
+	// Explicitly close the segment we just repaired to avoid issues with Windows.
+	s.Close()
+
+	// We always want to start writing to a new Segment rather than an existing
+	// Segment, which is handled by NewSize, but earlier in Repair we're deleting
+	// all segments that come after the corrupted Segment. Recreate a new Segment here.
+	s, err = CreateSegment(w.dir, cerr.Segment+1)
+	if err != nil {
+		return err
+	}
+	if err := w.setSegment(s); err != nil {
+		return err
+	}
+	return nil
+}
+
+// SegmentName returns the path of segment i inside dir. The file name is the
+// index in decimal, zero-padded to at least eight digits.
+func SegmentName(dir string, i int) string {
+	base := fmt.Sprintf("%08d", i)
+	return filepath.Join(dir, base)
+}
+
+// NextSegment creates the next segment and closes the previous one. It holds w.mtx for the duration of the call.
+func (w *WAL) NextSegment() error {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+	return w.nextSegment()
+}
+
+// nextSegment flushes the active page if it holds data, creates the segment
+// following the active one and makes it active. The previous segment is
+// fsynced and closed by a function queued on actorc, so the caller does not
+// wait for the fsync. NextSegment and Log call it with w.mtx held.
+func (w *WAL) nextSegment() error {
+	// Only flush the current page if it actually holds data.
+	if w.page.alloc > 0 {
+		if err := w.flushPage(true); err != nil {
+			return err
+		}
+	}
+	old := w.segment
+	created, err := CreateSegment(w.dir, old.Index()+1)
+	if err != nil {
+		return errors.Wrap(err, "create new segment file")
+	}
+	if err := w.setSegment(created); err != nil {
+		return err
+	}
+
+	// Don't block further writes by fsyncing the last segment.
+	w.actorc <- func() {
+		if err := w.fsync(old); err != nil {
+			level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
+		}
+		if err := old.Close(); err != nil {
+			level.Error(w.logger).Log("msg", "close previous segment", "err", err)
+		}
+	}
+	return nil
+}
+
+// setSegment makes segment the active segment, derives donePages from the
+// current file size and updates the current-segment gauge. The active
+// segment is replaced even when the Stat call fails.
+func (w *WAL) setSegment(segment *Segment) error {
+	w.segment = segment
+
+	// Correctly initialize donePages.
+	fi, err := segment.Stat()
+	if err != nil {
+		return err
+	}
+	pages := fi.Size() / pageSize
+	w.donePages = int(pages)
+	w.currentSegment.Set(float64(segment.Index()))
+	return nil
+}
+
+// flushPage writes the not-yet-flushed contents of the page to disk. If no
+// more records will fit into the page, the remaining bytes will be written
+// as zeros and a new page will be started.
+// If clear is true, this is enforced regardless of how many bytes are left in the page.
+//
+// When the write fails, the bytes that did reach the segment are still
+// counted as flushed, so a later retry continues after them instead of
+// appending them to the segment a second time.
+func (w *WAL) flushPage(clear bool) error {
+	w.pageFlushes.Inc()
+
+	p := w.page
+	clear = clear || p.full()
+
+	// No more data will fit into the page or an implicit clear.
+	// Enqueue and clear it.
+	if clear {
+		p.alloc = pageSize // Write till end of page.
+	}
+	n, err := w.segment.Write(p.buf[p.flushed:p.alloc])
+	// Write may report n > 0 together with an error (short write), so the
+	// progress has to be recorded before bailing out.
+	p.flushed += n
+	if err != nil {
+		return err
+	}
+
+	// We flushed an entire page, prepare a new one.
+	if clear {
+		for i := range p.buf {
+			p.buf[i] = 0
+		}
+		p.alloc = 0
+		p.flushed = 0
+		w.donePages++
+		w.pageCompletions.Inc()
+	}
+	return nil
+}
+
+// First byte of the record header:
+// [ 4 bits unallocated ] [ 1 bit snappy compression flag ] [ 3 bit record type ]
+const (
+	snappyMask  = 1 << 3         // Bit set on the type byte when the record was snappy-compressed.
+	recTypeMask = snappyMask - 1 // Low three bits that hold the recType.
+)
+
+type recType uint8 // recType is the fragment type stored in the low bits of the first header byte.
+
+const (
+	recPageTerm recType = 0 // Rest of page is empty.
+	recFull     recType = 1 // Full record.
+	recFirst    recType = 2 // First fragment of a record.
+	recMiddle   recType = 3 // Middle fragments of a record.
+	recLast     recType = 4 // Final fragment of a record.
+)
+
+// recTypeFromHeader extracts the record type from the first header byte by
+// masking off everything but the low three bits.
+func recTypeFromHeader(header byte) recType {
+	return recType(header) & recTypeMask
+}
+
+// String returns a short human-readable name for the record type, or
+// "<invalid>" for values outside the defined set.
+func (t recType) String() string {
+	names := [...]string{
+		recPageTerm: "zero",
+		recFull:     "full",
+		recFirst:    "first",
+		recMiddle:   "middle",
+		recLast:     "last",
+	}
+	if int(t) < len(names) {
+		return names[t]
+	}
+	return "<invalid>"
+}
+
+// pagesPerSegment returns how many whole pages fit into one segment of the
+// configured size.
+func (w *WAL) pagesPerSegment() int {
+	pages := w.segmentSize / pageSize
+	return pages
+}
+
+// Log writes the records into the log, holding w.mtx for the whole batch.
+// Multiple records can be passed at once to reduce writes and increase
+// throughput: only the last record of the batch is logged with the final
+// flag set. Logging stops at the first error, which is returned.
+func (w *WAL) Log(recs ...[]byte) error {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+	// Callers could just implement their own list record format but adding
+	// a bit of extra logic here frees them from that overhead.
+	last := len(recs) - 1
+	for i := range recs {
+		if err := w.log(recs[i], i == last); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// log writes rec to the log, splitting it into fragments across pages as needed. The current page is flushed if:
+// - rec is the final record of a batch (final is true),
+// - a fragment of rec fills up the page, or
+// - the current page is already full when log is entered.
+func (w *WAL) log(rec []byte, final bool) error {
+	// When the last page flush failed the page will remain full.
+	// When the page is full, need to flush it before trying to add more records to it.
+	if w.page.full() {
+		if err := w.flushPage(true); err != nil {
+			return err
+		}
+	}
+	// If the record is too big to fit within the active page in the current
+	// segment, terminate the active segment and advance to the next one.
+	// This ensures that records do not cross segment boundaries.
+	left := w.page.remaining() - recordHeaderSize                                   // Free space in the active page.
+	left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
+
+	if len(rec) > left {
+		if err := w.nextSegment(); err != nil {
+			return err
+		}
+	}
+
+	compressed := false
+	if w.compress && len(rec) > 0 {
+		// The snappy library uses `len` to calculate if we need a new buffer.
+		// In order to allocate as few buffers as possible make the length
+		// equal to the capacity.
+		w.snappyBuf = w.snappyBuf[:cap(w.snappyBuf)]
+		w.snappyBuf = snappy.Encode(w.snappyBuf, rec)
+		if len(w.snappyBuf) < len(rec) { // Only keep the compressed form when it is actually smaller.
+			rec = w.snappyBuf
+			compressed = true
+		}
+	}
+
+	// Populate as many pages as necessary to fit the record.
+	// Be careful to always do one pass to ensure we write zero-length records.
+	for i := 0; i == 0 || len(rec) > 0; i++ {
+		p := w.page
+
+		// Find how much of the record we can fit into the page.
+		var (
+			l    = min(len(rec), (pageSize-p.alloc)-recordHeaderSize) // Payload bytes that fit after this fragment's header.
+			part = rec[:l]
+			buf  = p.buf[p.alloc:]
+			typ  recType
+		)
+
+		switch {
+		case i == 0 && len(part) == len(rec):
+			typ = recFull
+		case len(part) == len(rec):
+			typ = recLast
+		case i == 0:
+			typ = recFirst
+		default:
+			typ = recMiddle
+		}
+		if compressed {
+			typ |= snappyMask
+		}
+
+		buf[0] = byte(typ)
+		crc := crc32.Checksum(part, castagnoliTable)
+		binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
+		binary.BigEndian.PutUint32(buf[3:], crc)
+
+		copy(buf[recordHeaderSize:], part)
+		p.alloc += len(part) + recordHeaderSize
+
+		if w.page.full() {
+			if err := w.flushPage(true); err != nil {
+				return err
+			}
+		}
+		rec = rec[l:]
+	}
+
+	// If it's the final record of the batch and the page is not empty, flush it.
+	if final && w.page.alloc > 0 {
+		if err := w.flushPage(false); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Segments reports the indexes of the first and last segment currently
+// present in the WAL directory, i.e. the inclusive range [first, last].
+// When the directory holds no segments, both first and last are -1.
+// If the directory cannot be listed, the error is returned and first and
+// last are 0.
+func (w *WAL) Segments() (first, last int, err error) {
+	segs, err := listSegments(w.dir)
+	switch {
+	case err != nil:
+		return 0, 0, err
+	case len(segs) == 0:
+		return -1, -1, nil
+	}
+	oldest, newest := segs[0], segs[len(segs)-1]
+	return oldest.index, newest.index, nil
+}
+
+// Truncate drops all segments before i.
+func (w *WAL) Truncate(i int) (err error) {
+	w.truncateTotal.Inc()
+	defer func() {
+		if err != nil {
+			w.truncateFail.Inc()
+		}
+	}()
+	refs, err := listSegments(w.dir)
+	if err != nil {
+		return err
+	}
+	for _, r := range refs {
+		if r.index >= i {
+			break
+		}
+		if err = os.Remove(filepath.Join(w.dir, r.name)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// fsync syncs the file backing segment f and records how long the sync took
+// in the fsync duration metric. The sync error, if any, is returned as is.
+func (w *WAL) fsync(f *Segment) error {
+	begin := time.Now()
+	syncErr := f.File.Sync()
+	elapsed := time.Since(begin)
+	w.fsyncDuration.Observe(elapsed.Seconds())
+	return syncErr
+}
+
+// Close flushes all writes and closes active segment.
+// It returns an error if the WAL was already closed or if flushing the last
+// page fails; in both cases the WAL is not marked as closed. Errors from
+// syncing or closing the active segment are only logged, and Close still
+// returns nil.
+func (w *WAL) Close() (err error) {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+
+	if w.closed {
+		return errors.New("wal already closed")
+	}
+
+	// Flush the last page and zero out all its remaining size.
+	// We must not flush an empty page as it would falsely signal
+	// the segment is done if we start writing to it again after opening.
+	if w.page.alloc > 0 {
+		if err := w.flushPage(true); err != nil {
+			return err
+		}
+	}
+
+	// Hand a fresh channel to whoever receives from stopc and block until
+	// that channel is signalled.
+	// NOTE(review): presumably a background goroutine of the WAL listens on
+	// stopc and acknowledges via donec - confirm against the constructor.
+	donec := make(chan struct{})
+	w.stopc <- donec
+	<-donec
+
+	// Failures below are logged rather than returned.
+	if err = w.fsync(w.segment); err != nil {
+		level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
+	}
+	if err := w.segment.Close(); err != nil {
+		level.Error(w.logger).Log("msg", "close previous segment", "err", err)
+	}
+	w.closed = true
+	return nil
+}
+
+// segmentRef identifies a segment file found in a WAL directory.
+type segmentRef struct {
+	name  string // File name as listed in the directory.
+	index int    // Segment index, parsed from name as a decimal integer.
+}
+
+func listSegments(dir string) (refs []segmentRef, err error) {
+	files, err := fileutil.ReadDir(dir)
+	if err != nil {
+		return nil, err
+	}
+	var last int
+	for _, fn := range files {
+		k, err := strconv.Atoi(fn)
+		if err != nil {
+			continue
+		}
+		if len(refs) > 0 && k > last+1 {
+			return nil, errors.New("segments are not sequential")
+		}
+		refs = append(refs, segmentRef{name: fn, index: k})
+		last = k
+	}
+	sort.Slice(refs, func(i, j int) bool {
+		return refs[i].index < refs[j].index
+	})
+	return refs, nil
+}
+
+// SegmentRange groups segments by the directory and the first and last index it includes.
+type SegmentRange struct {
+	// Dir is the directory that holds the segment files.
+	Dir string
+	// First and Last are the inclusive index bounds of the range.
+	// A negative value leaves the respective end of the range open.
+	First, Last int
+}
+
+// NewSegmentsReader returns a reader that covers every segment found in dir.
+// It is shorthand for a range reader over dir with both ends left open.
+func NewSegmentsReader(dir string) (io.ReadCloser, error) {
+	all := SegmentRange{Dir: dir, First: -1, Last: -1}
+	return NewSegmentsRangeReader(all)
+}
+
+// NewSegmentsRangeReader returns a new reader over the given WAL segment ranges.
+// If first or last are -1, the range is open on the respective end.
+//
+// Ranges are read in the order given and, within a range, segments are read
+// by ascending index. If listing a directory or opening a segment fails, all
+// segments opened so far are closed again before the error is returned.
+func NewSegmentsRangeReader(sr ...SegmentRange) (io.ReadCloser, error) {
+	var segs []*Segment
+
+	// closeOpened releases the segments opened so far; it is used on the
+	// error paths so that no file handles leak.
+	closeOpened := func() {
+		for _, s := range segs {
+			// Best effort: the error that made us bail out is the one
+			// worth reporting to the caller.
+			_ = s.Close()
+		}
+	}
+
+	for _, sgmRange := range sr {
+		refs, err := listSegments(sgmRange.Dir)
+		if err != nil {
+			closeOpened()
+			return nil, errors.Wrapf(err, "list segment in dir:%v", sgmRange.Dir)
+		}
+
+		for _, r := range refs {
+			if sgmRange.First >= 0 && r.index < sgmRange.First {
+				continue
+			}
+			// refs is ordered by index, so everything after this is out of range too.
+			if sgmRange.Last >= 0 && r.index > sgmRange.Last {
+				break
+			}
+			s, err := OpenReadSegment(filepath.Join(sgmRange.Dir, r.name))
+			if err != nil {
+				closeOpened()
+				return nil, errors.Wrapf(err, "open segment:%v in dir:%v", r.name, sgmRange.Dir)
+			}
+			segs = append(segs, s)
+		}
+	}
+	return NewSegmentBufReader(segs...), nil
+}
+
+// segmentBufReader is a buffered reader that reads in multiples of pages.
+// The main purpose is that we are able to track segment and offset for
+// corruption reporting. We have to be careful not to increment cur too
+// early, as it is used by Reader.Err() to tell Repair which segment is corrupt.
+// As such we pad the end of segments that are not page-aligned with zeros.
+type segmentBufReader struct {
+	buf  *bufio.Reader // Buffered reader over the current segment.
+	segs []*Segment    // All segments to read, in reading order.
+	cur  int           // Index into segs.
+	off  int           // Offset of read data into current segment.
+}
+
+func NewSegmentBufReader(segs ...*Segment) *segmentBufReader {
+	return &segmentBufReader{
+		buf:  bufio.NewReaderSize(segs[0], 16*pageSize),
+		segs: segs,
+	}
+}
+
+// Close closes every segment of the reader, also after one of them failed
+// to close. If several segments fail, the error of the last one is returned.
+func (r *segmentBufReader) Close() (err error) {
+	for i := range r.segs {
+		if closeErr := r.segs[i].Close(); closeErr != nil {
+			err = closeErr
+		}
+	}
+	return err
+}
+
+// Read implements io.Reader.
+//
+// Data is read from the current segment. When that segment ends in the
+// middle of a page, the rest of the page is reported as zeros before the
+// reader moves on, so that offsets stay page-aligned. io.EOF is returned
+// only once the last segment is exhausted.
+func (r *segmentBufReader) Read(b []byte) (n int, err error) {
+	n, err = r.buf.Read(b)
+	r.off += n
+
+	// Anything but EOF (success included) is passed straight through.
+	if err != io.EOF {
+		return n, err
+	}
+
+	// The segment ended inside a page: fake zero padding up to the page
+	// boundary, so we don't advance cur too early and report the wrong
+	// segment as corrupt.
+	if partial := r.off % pageSize; partial != 0 {
+		pad := pageSize - partial
+		if room := len(b) - n; pad > room {
+			pad = room
+		}
+		zeros := b[n : n+pad]
+		for k := range zeros {
+			zeros[k] = 0
+		}
+
+		// Return early, even if we didn't fill b.
+		r.off += pad
+		return n + pad, nil
+	}
+
+	// There is no more data left in the current segment and there are no
+	// more segments left. Return EOF.
+	if r.cur+1 >= len(r.segs) {
+		return n, io.EOF
+	}
+
+	// Move to next segment.
+	r.cur++
+	r.off = 0
+	r.buf.Reset(r.segs[r.cur])
+	return n, nil
+}
--- a/tsdb/wal/wal_test.go
+++ b/tsdb/wal/wal_test.go
@ -0,0 +1,477 @@
+// Copyright 2017 The Prometheus Authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package wal
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"testing"
+
+	client_testutil "github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/prometheus/tsdb/testutil"
+)
+
+// TestWALRepair_ReadingError ensures that a repair is run for an error
+// when reading a record.
+//
+// Every case writes the same WAL, applies one kind of corruption to one
+// segment, repairs the WAL with the error reported by the reader and then
+// checks how many of the original records are still readable.
+func TestWALRepair_ReadingError(t *testing.T) {
+	// NOTE(review): the corrFunc closures below use the outer t, not the t of
+	// the subtest they run in.
+	for name, test := range map[string]struct {
+		corrSgm    int              // Which segment to corrupt.
+		corrFunc   func(f *os.File) // Func that applies the corruption.
+		intactRecs int              // Total expected records left after the repair.
+	}{
+		// Overwrites the first byte of the third page of the segment.
+		"torn_last_record": {
+			2,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize*2, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{byte(recFirst)})
+				testutil.Ok(t, err)
+			},
+			8,
+		},
+		// Ensures that the page buffer is big enough to fit
+		// an entire page size without panicking.
+		// https://github.com/prometheus/tsdb/pull/414
+		"bad_header": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{byte(recPageTerm)})
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+		// All remaining cases corrupt the second page of segment 1.
+		"bad_fragment_sequence": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{byte(recLast)})
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+		"bad_fragment_flag": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{123})
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+		"bad_checksum": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize+4, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{0})
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+		"bad_length": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize+2, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte{0})
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+		"bad_content": {
+			1,
+			func(f *os.File) {
+				_, err := f.Seek(pageSize+100, 0)
+				testutil.Ok(t, err)
+				_, err = f.Write([]byte("beef"))
+				testutil.Ok(t, err)
+			},
+			4,
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir("", "wal_repair")
+			testutil.Ok(t, err)
+			defer func() {
+				testutil.Ok(t, os.RemoveAll(dir))
+			}()
+
+			// We create 3 segments with 3 records each and
+			// then corrupt a given record in a given segment.
+			// As a result we want a repaired WAL with given intact records.
+			segSize := 3 * pageSize
+			w, err := NewSize(nil, nil, dir, segSize, false)
+			testutil.Ok(t, err)
+
+			var records [][]byte
+
+			// Each record fills exactly one page together with its header;
+			// the first byte tells the records apart.
+			for i := 1; i <= 9; i++ {
+				b := make([]byte, pageSize-recordHeaderSize)
+				b[0] = byte(i)
+				records = append(records, b)
+				testutil.Ok(t, w.Log(b))
+			}
+			first, last, err := w.Segments()
+			testutil.Ok(t, err)
+			testutil.Equals(t, 3, 1+last-first, "wal creation didn't result in expected number of segments")
+
+			testutil.Ok(t, w.Close())
+
+			f, err := os.OpenFile(SegmentName(dir, test.corrSgm), os.O_RDWR, 0666)
+			testutil.Ok(t, err)
+
+			// Apply corruption function.
+			test.corrFunc(f)
+
+			testutil.Ok(t, f.Close())
+
+			w, err = NewSize(nil, nil, dir, segSize, false)
+			testutil.Ok(t, err)
+			defer w.Close()
+
+			first, last, err = w.Segments()
+			testutil.Ok(t, err)
+
+			// Read the segments one by one and repair the WAL with the
+			// first read error encountered.
+			for i := first; i <= last; i++ {
+				s, err := OpenReadSegment(SegmentName(w.Dir(), i))
+				testutil.Ok(t, err)
+
+				sr := NewSegmentBufReader(s)
+				// NOTE(review): err was already checked above and
+				// NewSegmentBufReader returns no error, so this check is redundant.
+				testutil.Ok(t, err)
+				r := NewReader(sr)
+				for r.Next() {
+				}
+
+				// Close the segment so we don't break things on Windows.
+				s.Close()
+
+				// No corruption in this segment.
+				if r.Err() == nil {
+					continue
+				}
+				testutil.Ok(t, w.Repair(r.Err()))
+				break
+			}
+
+			// After the repair the whole WAL must be readable without error.
+			sr, err := NewSegmentsReader(dir)
+			testutil.Ok(t, err)
+			defer sr.Close()
+			r := NewReader(sr)
+
+			var result [][]byte
+			for r.Next() {
+				// Copy the record, the reader's buffer is only appended from here.
+				var b []byte
+				result = append(result, append(b, r.Record()...))
+			}
+			testutil.Ok(t, r.Err())
+			testutil.Equals(t, test.intactRecs, len(result), "Wrong number of intact records")
+
+			for i, r := range result {
+				if !bytes.Equal(records[i], r) {
+					t.Fatalf("record %d diverges: want %x, got %x", i, records[i][:10], r[:10])
+				}
+			}
+
+			// Make sure there is a new 0 size Segment after the corrupted Segment.
+			_, last, err = w.Segments()
+			testutil.Ok(t, err)
+			testutil.Equals(t, test.corrSgm+1, last)
+			fi, err := os.Stat(SegmentName(dir, last))
+			testutil.Ok(t, err)
+			testutil.Equals(t, int64(0), fi.Size())
+		})
+	}
+}
+
+// TestCorruptAndCarryOn writes a multi-segment WAL, corrupts the first segment
+// and ensures that an error while reading that segment is correctly repaired
+// before moving on to write more records to the WAL.
+func TestCorruptAndCarryOn(t *testing.T) {
+	dir, err := ioutil.TempDir("", "wal_repair")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	var (
+		logger      = testutil.NewLogger(t)
+		segmentSize = pageSize * 3
+		recordSize  = (pageSize / 3) - recordHeaderSize
+	)
+
+	// Produce a WAL with two segments of 3 pages with 3 records per page,
+	// so when we truncate the file we're guaranteed to split a record.
+	{
+		w, err := NewSize(logger, nil, dir, segmentSize, false)
+		testutil.Ok(t, err)
+
+		for i := 0; i < 18; i++ {
+			buf := make([]byte, recordSize)
+			_, err := rand.Read(buf)
+			testutil.Ok(t, err)
+
+			err = w.Log(buf)
+			testutil.Ok(t, err)
+		}
+
+		err = w.Close()
+		testutil.Ok(t, err)
+	}
+
+	// Check all the segments are the correct size.
+	{
+		segments, err := listSegments(dir)
+		testutil.Ok(t, err)
+		for _, segment := range segments {
+			f, err := os.OpenFile(filepath.Join(dir, fmt.Sprintf("%08d", segment.index)), os.O_RDONLY, 0666)
+			testutil.Ok(t, err)
+
+			fi, err := f.Stat()
+			testutil.Ok(t, err)
+
+			t.Log("segment", segment.index, "size", fi.Size())
+			testutil.Equals(t, int64(segmentSize), fi.Size())
+
+			err = f.Close()
+			testutil.Ok(t, err)
+		}
+	}
+
+	// Truncate the first file, splitting the middle record in the second
+	// page in half, leaving 4 valid records.
+	{
+		f, err := os.OpenFile(filepath.Join(dir, fmt.Sprintf("%08d", 0)), os.O_RDWR, 0666)
+		testutil.Ok(t, err)
+
+		fi, err := f.Stat()
+		testutil.Ok(t, err)
+		testutil.Equals(t, int64(segmentSize), fi.Size())
+
+		err = f.Truncate(int64(segmentSize / 2))
+		testutil.Ok(t, err)
+
+		err = f.Close()
+		testutil.Ok(t, err)
+	}
+
+	// Now try and repair this WAL, and write 5 more records to it.
+	{
+		sr, err := NewSegmentsReader(dir)
+		testutil.Ok(t, err)
+
+		reader := NewReader(sr)
+		i := 0
+		for ; i < 4 && reader.Next(); i++ {
+			testutil.Equals(t, recordSize, len(reader.Record()))
+		}
+		testutil.Equals(t, 4, i, "not enough records")
+		testutil.Assert(t, !reader.Next(), "unexpected record")
+
+		corruptionErr := reader.Err()
+		testutil.Assert(t, corruptionErr != nil, "expected error")
+
+		err = sr.Close()
+		testutil.Ok(t, err)
+
+		w, err := NewSize(logger, nil, dir, segmentSize, false)
+		testutil.Ok(t, err)
+
+		err = w.Repair(corruptionErr)
+		testutil.Ok(t, err)
+
+		// Ensure that we have a completely clean slate after repairing.
+		// The expected value goes first, as in the other Equals calls, so
+		// that a failure message labels the values correctly.
+		testutil.Equals(t, 1, w.segment.Index()) // We corrupted segment 0.
+		testutil.Equals(t, 0, w.donePages)
+
+		for i := 0; i < 5; i++ {
+			buf := make([]byte, recordSize)
+			_, err := rand.Read(buf)
+			testutil.Ok(t, err)
+
+			err = w.Log(buf)
+			testutil.Ok(t, err)
+		}
+
+		err = w.Close()
+		testutil.Ok(t, err)
+	}
+
+	// Replay the WAL. Should get 9 records: the 4 that survived the
+	// corruption plus the 5 written after the repair.
+	{
+		sr, err := NewSegmentsReader(dir)
+		testutil.Ok(t, err)
+
+		reader := NewReader(sr)
+		i := 0
+		for ; i < 9 && reader.Next(); i++ {
+			testutil.Equals(t, recordSize, len(reader.Record()))
+		}
+		testutil.Equals(t, 9, i, "wrong number of records")
+		testutil.Assert(t, !reader.Next(), "unexpected record")
+		testutil.Equals(t, nil, reader.Err())
+		testutil.Ok(t, sr.Close())
+	}
+}
+
+// TestClose checks that the first Close of a WAL succeeds and that a second
+// Close returns an error rather than panicking or blocking.
+func TestClose(t *testing.T) {
+	tmpDir, err := ioutil.TempDir("", "wal_repair")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpDir))
+	}()
+
+	w, err := NewSize(nil, nil, tmpDir, pageSize, false)
+	testutil.Ok(t, err)
+
+	firstErr := w.Close()
+	testutil.Ok(t, firstErr)
+
+	secondErr := w.Close()
+	testutil.NotOk(t, secondErr)
+}
+
+// TestSegmentMetric checks that the current segment metric goes up by one
+// when the WAL rotates to the next segment.
+func TestSegmentMetric(t *testing.T) {
+	const (
+		segmentSize = pageSize
+		recordSize  = (pageSize / 2) - recordHeaderSize
+	)
+
+	tmpDir, err := ioutil.TempDir("", "segment_metric")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpDir))
+	}()
+
+	w, err := NewSize(nil, nil, tmpDir, segmentSize, false)
+	testutil.Ok(t, err)
+
+	before := client_testutil.ToFloat64(w.currentSegment)
+
+	// A segment holds two of these records, so the third one written
+	// forces a rotation to the next segment.
+	for written := 0; written < 3; written++ {
+		payload := make([]byte, recordSize)
+		_, err := rand.Read(payload)
+		testutil.Ok(t, err)
+
+		testutil.Ok(t, w.Log(payload))
+	}
+
+	after := client_testutil.ToFloat64(w.currentSegment)
+	testutil.Assert(t, after == before+1, "segment metric did not increment after segment rotation")
+	testutil.Ok(t, w.Close())
+}
+
+// TestCompression writes the same all-zero records to a compressed and an
+// uncompressed WAL and checks that the compressed directory is at least 25%
+// smaller.
+func TestCompression(t *testing.T) {
+	const (
+		segmentSize = pageSize
+		recordSize  = (pageSize / 2) - recordHeaderSize
+		records     = 100
+	)
+
+	// writeWAL fills a fresh WAL in a new temporary directory and returns
+	// the directory path.
+	writeWAL := func(compressed bool) string {
+		prefix := fmt.Sprintf("TestCompression_%t", compressed)
+		walDir, err := ioutil.TempDir("", prefix)
+		testutil.Ok(t, err)
+
+		w, err := NewSize(nil, nil, walDir, segmentSize, compressed)
+		testutil.Ok(t, err)
+
+		zeros := make([]byte, recordSize)
+		for written := 0; written < records; written++ {
+			testutil.Ok(t, w.Log(zeros))
+		}
+		testutil.Ok(t, w.Close())
+
+		return walDir
+	}
+
+	dirCompressed := writeWAL(true)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dirCompressed))
+	}()
+	dirUnCompressed := writeWAL(false)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dirUnCompressed))
+	}()
+
+	uncompressedSize := testutil.DirSize(t, dirUnCompressed)
+	compressedSize := testutil.DirSize(t, dirCompressed)
+
+	savesEnough := float64(uncompressedSize)*0.75 > float64(compressedSize)
+	testutil.Assert(t, savesEnough, "Compressing zeroes should save at least 25%% space - uncompressedSize: %d, compressedSize: %d", uncompressedSize, compressedSize)
+}
+
+// BenchmarkWAL_LogBatched measures logging 2048-byte records in batches of
+// 1000 records per Log call, with and without compression.
+func BenchmarkWAL_LogBatched(b *testing.B) {
+	for _, compress := range []bool{true, false} {
+		name := fmt.Sprintf("compress=%t", compress)
+		b.Run(name, func(b *testing.B) {
+			tmpDir, err := ioutil.TempDir("", "bench_logbatch")
+			testutil.Ok(b, err)
+			defer func() {
+				testutil.Ok(b, os.RemoveAll(tmpDir))
+			}()
+
+			w, err := New(nil, nil, tmpDir, compress)
+			testutil.Ok(b, err)
+			defer w.Close()
+
+			var (
+				payload [2048]byte
+				batch   [][]byte
+			)
+			b.SetBytes(2048)
+
+			for n := 0; n < b.N; n++ {
+				batch = append(batch, payload[:])
+				// Only write once a full batch has been collected.
+				if len(batch) >= 1000 {
+					testutil.Ok(b, w.Log(batch...))
+					batch = batch[:0]
+				}
+			}
+			// Stop timer to not count fsync time on close.
+			// If it's counted batched vs. single benchmarks are very similar but
+			// do not show burst throughput well.
+			b.StopTimer()
+		})
+	}
+}
+
+// BenchmarkWAL_Log measures logging 2048-byte records with one Log call per
+// record, with and without compression.
+func BenchmarkWAL_Log(b *testing.B) {
+	for _, compress := range []bool{true, false} {
+		name := fmt.Sprintf("compress=%t", compress)
+		b.Run(name, func(b *testing.B) {
+			tmpDir, err := ioutil.TempDir("", "bench_logsingle")
+			testutil.Ok(b, err)
+			defer func() {
+				testutil.Ok(b, os.RemoveAll(tmpDir))
+			}()
+
+			w, err := New(nil, nil, tmpDir, compress)
+			testutil.Ok(b, err)
+			defer w.Close()
+
+			var payload [2048]byte
+			b.SetBytes(2048)
+
+			for n := 0; n < b.N; n++ {
+				testutil.Ok(b, w.Log(payload[:]))
+			}
+			// Stop timer to not count fsync time on close.
+			// If it's counted batched vs. single benchmarks are very similar but
+			// do not show burst throughput well.
+			b.StopTimer()
+		})
+	}
+}
--- a/tsdb/wal_test.go
+++ b/tsdb/wal_test.go
@ -0,0 +1,566 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !windows
+
+package tsdb
+
+import (
+	"encoding/binary"
+	"io"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/go-kit/kit/log"
+	"github.com/prometheus/tsdb/fileutil"
+	"github.com/prometheus/tsdb/labels"
+	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/wal"
+)
+
+// TestSegmentWAL_cut checks that cutting the WAL starts a new segment file
+// and that every segment file begins with a valid header followed by the
+// entry that was written to it.
+func TestSegmentWAL_cut(t *testing.T) {
+	tmpdir, err := ioutil.TempDir("", "test_wal_cut")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpdir))
+	}()
+
+	// This calls cut() implicitly the first time without a previous tail.
+	w, err := OpenSegmentWAL(tmpdir, nil, 0, nil)
+	testutil.Ok(t, err)
+
+	testutil.Ok(t, w.write(WALEntrySeries, 1, []byte("Hello World!!")))
+
+	testutil.Ok(t, w.cut())
+
+	// Cutting creates a new file.
+	testutil.Equals(t, 2, len(w.files))
+
+	testutil.Ok(t, w.write(WALEntrySeries, 1, []byte("Hello World!!")))
+
+	testutil.Ok(t, w.Close())
+
+	for _, of := range w.files {
+		f, err := os.Open(of.Name())
+		testutil.Ok(t, err)
+
+		// Verify header data. ReadFull guards against a short read, which
+		// a plain Read is allowed to return without an error.
+		metab := make([]byte, 8)
+		_, err = io.ReadFull(f, metab)
+		testutil.Ok(t, err)
+		testutil.Equals(t, WALMagic, binary.BigEndian.Uint32(metab[:4]))
+		testutil.Equals(t, WALFormatDefault, metab[4])
+
+		// We cannot actually check for correct pre-allocation as it is
+		// optional per filesystem and handled transparently.
+		et, flag, b, err := newWALReader(nil, nil).entry(f)
+		testutil.Ok(t, err)
+		testutil.Equals(t, WALEntrySeries, et)
+		testutil.Equals(t, byte(walSeriesSimple), flag)
+		testutil.Equals(t, []byte("Hello World!!"), b)
+
+		// Close explicitly; a defer would keep every file open until the
+		// test function returns.
+		testutil.Ok(t, f.Close())
+	}
+}
+
+// TestSegmentWAL_Truncate logs a large set of series across many small
+// segments, truncates the WAL twice while keeping every second series, and
+// checks that a freshly opened WAL reads back exactly the expected series.
+func TestSegmentWAL_Truncate(t *testing.T) {
+	const (
+		numMetrics = 20000
+		batch      = 100
+	)
+	series, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), numMetrics)
+	testutil.Ok(t, err)
+
+	dir, err := ioutil.TempDir("", "test_wal_log_truncate")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	w, err := OpenSegmentWAL(dir, nil, 0, nil)
+	testutil.Ok(t, err)
+	// Small segments, so that the series end up spread over many files.
+	w.segmentSize = 10000
+
+	// Series refs are 1-based: series[k] is logged with ref k+1.
+	for i := 0; i < numMetrics; i += batch {
+		var rs []RefSeries
+
+		for j, s := range series[i : i+batch] {
+			rs = append(rs, RefSeries{Labels: s, Ref: uint64(i+j) + 1})
+		}
+		err := w.LogSeries(rs)
+		testutil.Ok(t, err)
+	}
+
+	// We set maxTime of the 2nd half of the files to values >= 1000, the
+	// time passed to Truncate below. Presumably that excludes them from the
+	// files selected for truncation - confirm against SegmentWAL.Truncate.
+	for i, f := range w.files[len(w.files)/2:] {
+		f.maxTime = int64(1000 + i)
+	}
+	// All series in those files must be preserved regardless of the provided postings list.
+	boundarySeries := w.files[len(w.files)/2].minSeries
+
+	// We truncate while keeping every 2nd series.
+	keep := map[uint64]struct{}{}
+	for i := 1; i <= numMetrics; i += 2 {
+		keep[uint64(i)] = struct{}{}
+	}
+	keepf := func(id uint64) bool {
+		_, ok := keep[id]
+		return ok
+	}
+
+	err = w.Truncate(1000, keepf)
+	testutil.Ok(t, err)
+
+	var expected []RefSeries
+
+	// Expect every odd ref plus everything from the boundary series onwards.
+	for i := 1; i <= numMetrics; i++ {
+		if i%2 == 1 || uint64(i) >= boundarySeries {
+			expected = append(expected, RefSeries{Ref: uint64(i), Labels: series[i-1]})
+		}
+	}
+
+	// Call Truncate once again to see whether we can read the written file without
+	// creating a new WAL.
+	err = w.Truncate(1000, keepf)
+	testutil.Ok(t, err)
+	testutil.Ok(t, w.Close())
+
+	// The same again with a new WAL.
+	w, err = OpenSegmentWAL(dir, nil, 0, nil)
+	testutil.Ok(t, err)
+
+	var readSeries []RefSeries
+	r := w.Reader()
+
+	testutil.Ok(t, r.Read(func(s []RefSeries) {
+		readSeries = append(readSeries, s...)
+	}, nil, nil))
+
+	testutil.Equals(t, expected, readSeries)
+}
+
+// Symmetrical test of reading and writing to the WAL via its main interface.
+//
+// The WAL is opened several times in a row. Each round first reads back
+// everything and compares it with what earlier rounds recorded, then logs a
+// further slice of series together with generated samples and tombstones.
+func TestSegmentWAL_Log_Restore(t *testing.T) {
+	const (
+		numMetrics = 50
+		iterations = 5
+		stepSize   = 5
+	)
+	// Generate testing data. It does not make semantical sense but
+	// for the purpose of this test.
+	series, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), numMetrics)
+	testutil.Ok(t, err)
+
+	dir, err := ioutil.TempDir("", "test_wal_log_restore")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	// Everything logged so far, across all rounds, in logging order.
+	var (
+		recordedSeries  [][]RefSeries
+		recordedSamples [][]RefSample
+		recordedDeletes [][]Stone
+	)
+	// NOTE(review): totalSamples is accumulated below but never read.
+	var totalSamples int
+
+	// Open WAL a bunch of times, validate all previous data can be read,
+	// write more data to it, close it.
+	for k := 0; k < numMetrics; k += numMetrics / iterations {
+		w, err := OpenSegmentWAL(dir, nil, 0, nil)
+		testutil.Ok(t, err)
+
+		// Set smaller segment size so we can actually write several files.
+		w.segmentSize = 1000 * 1000
+
+		r := w.Reader()
+
+		var (
+			resultSeries  [][]RefSeries
+			resultSamples [][]RefSample
+			resultDeletes [][]Stone
+		)
+
+		// The callbacks copy their argument before keeping it and skip
+		// empty batches.
+		serf := func(series []RefSeries) {
+			if len(series) > 0 {
+				clsets := make([]RefSeries, len(series))
+				copy(clsets, series)
+				resultSeries = append(resultSeries, clsets)
+			}
+		}
+		smplf := func(smpls []RefSample) {
+			if len(smpls) > 0 {
+				csmpls := make([]RefSample, len(smpls))
+				copy(csmpls, smpls)
+				resultSamples = append(resultSamples, csmpls)
+			}
+		}
+
+		delf := func(stones []Stone) {
+			if len(stones) > 0 {
+				cst := make([]Stone, len(stones))
+				copy(cst, stones)
+				resultDeletes = append(resultDeletes, cst)
+			}
+		}
+
+		testutil.Ok(t, r.Read(serf, smplf, delf))
+
+		testutil.Equals(t, recordedSamples, resultSamples)
+		testutil.Equals(t, recordedSeries, resultSeries)
+		testutil.Equals(t, recordedDeletes, resultDeletes)
+
+		// Shadows the outer series with this round's slice of it.
+		series := series[k : k+(numMetrics/iterations)]
+
+		// Insert in batches and generate different amounts of samples for each.
+		for i := 0; i < len(series); i += stepSize {
+			var samples []RefSample
+			var stones []Stone
+
+			// The first batch of a round (i == 0) gets no samples and no tombstones.
+			for j := 0; j < i*10; j++ {
+				samples = append(samples, RefSample{
+					Ref: uint64(j % 10000),
+					T:   int64(j * 2),
+					V:   rand.Float64(),
+				})
+			}
+
+			for j := 0; j < i*20; j++ {
+				ts := rand.Int63()
+				stones = append(stones, Stone{rand.Uint64(), Intervals{{ts, ts + rand.Int63n(10000)}}})
+			}
+
+			lbls := series[i : i+stepSize]
+			// Shadows series once more, now as the batch of RefSeries to log.
+			series := make([]RefSeries, 0, len(series))
+			for j, l := range lbls {
+				series = append(series, RefSeries{
+					Ref:    uint64(i + j),
+					Labels: l,
+				})
+			}
+
+			testutil.Ok(t, w.LogSeries(series))
+			testutil.Ok(t, w.LogSamples(samples))
+			testutil.Ok(t, w.LogDeletes(stones))
+
+			// Only non-empty batches are recorded, matching the read
+			// callbacks above, which skip empty batches.
+			if len(lbls) > 0 {
+				recordedSeries = append(recordedSeries, series)
+			}
+			if len(samples) > 0 {
+				recordedSamples = append(recordedSamples, samples)
+				totalSamples += len(samples)
+			}
+			if len(stones) > 0 {
+				recordedDeletes = append(recordedDeletes, stones)
+			}
+		}
+
+		testutil.Ok(t, w.Close())
+	}
+}
+
+// TestWALRestoreCorrupted_invalidSegment checks that opening a WAL whose
+// second segment has an invalid header leaves only the first segment file
+// in the directory.
+func TestWALRestoreCorrupted_invalidSegment(t *testing.T) {
+	dir, err := ioutil.TempDir("", "test_wal_log_restore")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	// Named w rather than wal, so the imported wal package is not shadowed.
+	w, err := OpenSegmentWAL(dir, nil, 0, nil)
+	testutil.Ok(t, err)
+
+	// Close every created segment file, so no file handle is leaked.
+	f0, err := w.createSegmentFile(filepath.Join(dir, "000000"))
+	testutil.Ok(t, err)
+	testutil.Ok(t, f0.Close())
+	f, err := w.createSegmentFile(filepath.Join(dir, "000001"))
+	testutil.Ok(t, err)
+	f2, err := w.createSegmentFile(filepath.Join(dir, "000002"))
+	testutil.Ok(t, err)
+	testutil.Ok(t, f2.Close())
+
+	// Make header of second segment invalid.
+	_, err = f.WriteAt([]byte{1, 2, 3, 4}, 0)
+	testutil.Ok(t, err)
+	testutil.Ok(t, f.Close())
+
+	testutil.Ok(t, w.Close())
+
+	_, err = OpenSegmentWAL(dir, log.NewLogfmtLogger(os.Stderr), 0, nil)
+	testutil.Ok(t, err)
+
+	// Only the segment before the invalid one must be left.
+	fns, err := fileutil.ReadDir(dir)
+	testutil.Ok(t, err)
+	testutil.Equals(t, []string{"000000"}, fns)
+}
+
+// Test reading from a WAL that has been corrupted through various means.
+//
+// Every case damages the end of the first segment file. After re-opening,
+// only the first sample batch must be readable, new samples must be
+// writable, and a further re-open must yield the first batch followed by
+// the newly written one.
+func TestWALRestoreCorrupted(t *testing.T) {
+	cases := []struct {
+		name string
+		f    func(*testing.T, *SegmentWAL)
+	}{
+		{
+			// Cuts off the last byte of the file.
+			name: "truncate_checksum",
+			f: func(t *testing.T, w *SegmentWAL) {
+				f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
+				testutil.Ok(t, err)
+				defer f.Close()
+
+				off, err := f.Seek(0, io.SeekEnd)
+				testutil.Ok(t, err)
+
+				testutil.Ok(t, f.Truncate(off-1))
+			},
+		},
+		{
+			// Cuts off the last 8 bytes of the file.
+			name: "truncate_body",
+			f: func(t *testing.T, w *SegmentWAL) {
+				f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
+				testutil.Ok(t, err)
+				defer f.Close()
+
+				off, err := f.Seek(0, io.SeekEnd)
+				testutil.Ok(t, err)
+
+				testutil.Ok(t, f.Truncate(off-8))
+			},
+		},
+		{
+			// Overwrites the 4 bytes that start 8 bytes before the end of the file.
+			name: "body_content",
+			f: func(t *testing.T, w *SegmentWAL) {
+				f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
+				testutil.Ok(t, err)
+				defer f.Close()
+
+				off, err := f.Seek(0, io.SeekEnd)
+				testutil.Ok(t, err)
+
+				// Write junk before checksum starts.
+				_, err = f.WriteAt([]byte{1, 2, 3, 4}, off-8)
+				testutil.Ok(t, err)
+			},
+		},
+		{
+			// Overwrites the last 4 bytes of the file.
+			name: "checksum",
+			f: func(t *testing.T, w *SegmentWAL) {
+				f, err := os.OpenFile(w.files[0].Name(), os.O_WRONLY, 0666)
+				testutil.Ok(t, err)
+				defer f.Close()
+
+				off, err := f.Seek(0, io.SeekEnd)
+				testutil.Ok(t, err)
+
+				// Write junk into checksum
+				_, err = f.WriteAt([]byte{1, 2, 3, 4}, off-4)
+				testutil.Ok(t, err)
+			},
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			// Generate testing data. It does not make semantical sense but
+			// for the purpose of this test.
+			dir, err := ioutil.TempDir("", "test_corrupted")
+			testutil.Ok(t, err)
+			defer func() {
+				testutil.Ok(t, os.RemoveAll(dir))
+			}()
+
+			w, err := OpenSegmentWAL(dir, nil, 0, nil)
+			testutil.Ok(t, err)
+
+			// Two entries go into the first segment file ...
+			testutil.Ok(t, w.LogSamples([]RefSample{{T: 1, V: 2}}))
+			testutil.Ok(t, w.LogSamples([]RefSample{{T: 2, V: 3}}))
+
+			testutil.Ok(t, w.cut())
+
+			// Sleep 2 seconds to avoid errors where cut and the test "cases" function
+			// write or truncate the file out of order, as "cases" are not synchronized
+			// with cut. Hopefully cut will complete within 2 seconds.
+			time.Sleep(2 * time.Second)
+
+			// ... and two more into the second one.
+			testutil.Ok(t, w.LogSamples([]RefSample{{T: 3, V: 4}}))
+			testutil.Ok(t, w.LogSamples([]RefSample{{T: 5, V: 6}}))
+
+			testutil.Ok(t, w.Close())
+
+			// cut() truncates and fsyncs the first segment async. If it happens after
+			// the corruption we apply below, the corruption will be overwritten again.
+			// Fire and forget a sync to avoid flakiness.
+			w.files[0].Sync()
+			// Corrupt the second entry in the first file.
+			// After re-opening we must be able to read the first entry
+			// and the rest, including the second file, must be truncated for clean further
+			// writes.
+			c.f(t, w)
+
+			logger := log.NewLogfmtLogger(os.Stderr)
+
+			w2, err := OpenSegmentWAL(dir, logger, 0, nil)
+			testutil.Ok(t, err)
+
+			r := w2.Reader()
+
+			// No series were logged, so no series may be read.
+			serf := func(l []RefSeries) {
+				testutil.Equals(t, 0, len(l))
+			}
+
+			// Weird hack to check order of reads: the first batch must be the
+			// first entry logged above, any later batch the one logged via w2 below.
+			i := 0
+			samplf := func(s []RefSample) {
+				if i == 0 {
+					testutil.Equals(t, []RefSample{{T: 1, V: 2}}, s)
+					i++
+				} else {
+					testutil.Equals(t, []RefSample{{T: 99, V: 100}}, s)
+				}
+			}
+
+			testutil.Ok(t, r.Read(serf, samplf, nil))
+
+			testutil.Ok(t, w2.LogSamples([]RefSample{{T: 99, V: 100}}))
+			testutil.Ok(t, w2.Close())
+
+			// We should see the first valid entry and the new one, everything after
+			// is truncated.
+			// NOTE(review): w3 is never closed in this test - confirm whether
+			// that is intentional.
+			w3, err := OpenSegmentWAL(dir, logger, 0, nil)
+			testutil.Ok(t, err)
+
+			r = w3.Reader()
+
+			// Reset the read-order counter for the second pass.
+			i = 0
+			testutil.Ok(t, r.Read(serf, samplf, nil))
+		})
+	}
+}
+
+// TestMigrateWAL_Empty checks that migrating a WAL directory which holds a
+// freshly created, empty WAL in the new format succeeds.
+func TestMigrateWAL_Empty(t *testing.T) {
+	// The migration procedure must properly deal with a zero-length segment,
+	// which is valid in the new format.
+	tmpDir, err := ioutil.TempDir("", "walmigrate")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(tmpDir))
+	}()
+
+	walDir := path.Join(tmpDir, "wal")
+
+	// Create the WAL and close it again right away, without logging anything.
+	emptyWAL, err := wal.New(nil, nil, walDir, false)
+	testutil.Ok(t, err)
+	closeErr := emptyWAL.Close()
+	testutil.Ok(t, closeErr)
+
+	migrateErr := MigrateWAL(nil, walDir)
+	testutil.Ok(t, migrateErr)
+}
+
+// TestMigrateWAL_Fuzz writes series, samples and tombstones through the WAL
+// returned by OpenSegmentWAL, runs MigrateWAL over the directory and then
+// reads the directory back via the wal package. It expects every record to be
+// returned in the order it was written, followed by one sample record that is
+// logged after the migration. It also checks that MigrateWAL succeeds when no
+// WAL exists yet and when the WAL has already been migrated.
+func TestMigrateWAL_Fuzz(t *testing.T) {
+	dir, err := ioutil.TempDir("", "walmigrate")
+	testutil.Ok(t, err)
+	defer func() {
+		testutil.Ok(t, os.RemoveAll(dir))
+	}()
+
+	wdir := path.Join(dir, "wal")
+
+	// Should pass if no WAL exists yet.
+	testutil.Ok(t, MigrateWAL(nil, wdir))
+
+	oldWAL, err := OpenSegmentWAL(wdir, nil, time.Minute, nil)
+	testutil.Ok(t, err)
+
+	// Write some data.
+	testutil.Ok(t, oldWAL.LogSeries([]RefSeries{
+		{Ref: 100, Labels: labels.FromStrings("abc", "def", "123", "456")},
+		{Ref: 1, Labels: labels.FromStrings("abc", "def2", "1234", "4567")},
+	}))
+	testutil.Ok(t, oldWAL.LogSamples([]RefSample{
+		{Ref: 1, T: 100, V: 200},
+		{Ref: 2, T: 300, V: 400},
+	}))
+	testutil.Ok(t, oldWAL.LogSeries([]RefSeries{
+		{Ref: 200, Labels: labels.FromStrings("xyz", "def", "foo", "bar")},
+	}))
+	testutil.Ok(t, oldWAL.LogSamples([]RefSample{
+		{Ref: 3, T: 100, V: 200},
+		{Ref: 4, T: 300, V: 400},
+	}))
+	testutil.Ok(t, oldWAL.LogDeletes([]Stone{
+		{ref: 1, intervals: []Interval{{100, 200}}},
+	}))
+
+	testutil.Ok(t, oldWAL.Close())
+
+	// Perform migration.
+	testutil.Ok(t, MigrateWAL(nil, wdir))
+
+	w, err := wal.New(nil, nil, wdir, false)
+	testutil.Ok(t, err)
+
+	// We can properly write some new data after migration.
+	var enc RecordEncoder
+	testutil.Ok(t, w.Log(enc.Samples([]RefSample{
+		{Ref: 500, T: 1, V: 1},
+	}, nil)))
+
+	testutil.Ok(t, w.Close())
+
+	// Read back all data.
+	sr, err := wal.NewSegmentsReader(wdir)
+	testutil.Ok(t, err)
+	// Close the segments reader once the test is done. Deferred calls run in
+	// LIFO order, so this happens before the temporary directory is removed.
+	defer func() {
+		testutil.Ok(t, sr.Close())
+	}()
+
+	r := wal.NewReader(sr)
+	var res []interface{}
+	var dec RecordDecoder
+
+	for r.Next() {
+		rec := r.Record()
+
+		// Decode each record according to its type and collect the results
+		// in read order.
+		switch typ := dec.Type(rec); typ {
+		case RecordSeries:
+			s, err := dec.Series(rec, nil)
+			testutil.Ok(t, err)
+			res = append(res, s)
+		case RecordSamples:
+			s, err := dec.Samples(rec, nil)
+			testutil.Ok(t, err)
+			res = append(res, s)
+		case RecordTombstones:
+			s, err := dec.Tombstones(rec, nil)
+			testutil.Ok(t, err)
+			res = append(res, s)
+		default:
+			t.Fatalf("unknown record type %d", typ)
+		}
+	}
+	testutil.Ok(t, r.Err())
+
+	// The records written before the migration must come back unchanged and
+	// in order, followed by the sample logged after the migration.
+	testutil.Equals(t, []interface{}{
+		[]RefSeries{
+			{Ref: 100, Labels: labels.FromStrings("abc", "def", "123", "456")},
+			{Ref: 1, Labels: labels.FromStrings("abc", "def2", "1234", "4567")},
+		},
+		[]RefSample{{Ref: 1, T: 100, V: 200}, {Ref: 2, T: 300, V: 400}},
+		[]RefSeries{
+			{Ref: 200, Labels: labels.FromStrings("xyz", "def", "foo", "bar")},
+		},
+		[]RefSample{{Ref: 3, T: 100, V: 200}, {Ref: 4, T: 300, V: 400}},
+		[]Stone{{ref: 1, intervals: []Interval{{100, 200}}}},
+		[]RefSample{{Ref: 500, T: 1, V: 1}},
+	}, res)
+
+	// Migrating an already migrated WAL shouldn't do anything.
+	testutil.Ok(t, MigrateWAL(nil, wdir))
+}