Fix pages and index package
This commit is contained in:
parent af36c89178
commit 342aa82505

index (submodule, deleted)
@@ -1 +0,0 @@
Subproject commit be0456338d4d41ce559ba8371c3c6943df49c863
index/LICENSE (new file, 201 lines)
@@ -0,0 +1,201 @@
Apache License, Version 2.0, January 2004 — http://www.apache.org/licenses/
(The standard, unmodified Apache License 2.0 text, including the appendix describing how to apply the license to a work.)
index/cmd/index/.gitignore (new file, vendored, 3 lines)
@@ -0,0 +1,3 @@
index
benchout
testdata*
index/cmd/index/Makefile (new file, 14 lines)
@@ -0,0 +1,14 @@
all: bench svg

bench: build
	@echo ">> running benchmark"
	@./index bench write testdata

build:
	@go build .

svg:
	@echo ">> create svgs"
	@go tool pprof -svg ./index benchout/cpu.prof > benchout/cpuprof.svg
	@go tool pprof -svg ./index benchout/mem.prof > benchout/memprof.svg
	@go tool pprof -svg ./index benchout/block.prof > benchout/blockprof.svg
index/cmd/index/main.go (new file, 253 lines)
@@ -0,0 +1,253 @@
package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"runtime"
	"runtime/pprof"
	"time"

	"github.com/fabxc/tsdb/index"
	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
	"github.com/spf13/cobra"
)

func main() {
	root := &cobra.Command{
		Use:   "index",
		Short: "CLI tool for index",
	}

	root.AddCommand(
		NewBenchCommand(),
	)

	root.Execute()
}

func NewBenchCommand() *cobra.Command {
	c := &cobra.Command{
		Use:   "bench",
		Short: "run benchmarks",
	}
	c.AddCommand(NewBenchWriteCommand())

	return c
}

type writeBenchmark struct {
	outPath string
	cleanup bool

	cpuprof   *os.File
	memprof   *os.File
	blockprof *os.File
}

func NewBenchWriteCommand() *cobra.Command {
	var wb writeBenchmark
	c := &cobra.Command{
		Use:   "write <file>",
		Short: "run a write performance benchmark",
		Run:   wb.run,
	}
	c.PersistentFlags().StringVar(&wb.outPath, "out", "benchout/", "set the output path")

	return c
}

func (b *writeBenchmark) run(cmd *cobra.Command, args []string) {
	if len(args) != 1 {
		exitWithError(fmt.Errorf("missing file argument"))
	}
	if b.outPath == "" {
		dir, err := ioutil.TempDir("", "index_bench")
		if err != nil {
			exitWithError(err)
		}
		b.outPath = dir
		b.cleanup = true
	}
	if err := os.RemoveAll(b.outPath); err != nil {
		exitWithError(err)
	}
	if err := os.MkdirAll(b.outPath, 0777); err != nil {
		exitWithError(err)
	}

	var docs []*InsertDoc

	measureTime("readData", func() {
		f, err := os.Open(args[0])
		if err != nil {
			exitWithError(err)
		}
		defer f.Close()

		docs, err = readPrometheusLabels(f)
		if err != nil {
			exitWithError(err)
		}
	})

	dir := filepath.Join(b.outPath, "ix")

	ix, err := index.Open(dir, nil)
	if err != nil {
		exitWithError(err)
	}
	defer func() {
		ix.Close()
		reportSize(dir)
		if b.cleanup {
			os.RemoveAll(b.outPath)
		}
	}()

	measureTime("indexData", func() {
		b.startProfiling()
		indexDocs(ix, docs, 100000)
		indexDocs(ix, docs, 100000)
		indexDocs(ix, docs, 100000)
		indexDocs(ix, docs, 100000)
		b.stopProfiling()
	})
}

func (b *writeBenchmark) startProfiling() {
	var err error

	// Start CPU profiling.
	b.cpuprof, err = os.Create(filepath.Join(b.outPath, "cpu.prof"))
	if err != nil {
		exitWithError(fmt.Errorf("bench: could not create cpu profile: %v\n", err))
	}
	pprof.StartCPUProfile(b.cpuprof)

	// Start memory profiling.
	b.memprof, err = os.Create(filepath.Join(b.outPath, "mem.prof"))
	if err != nil {
		exitWithError(fmt.Errorf("bench: could not create memory profile: %v\n", err))
	}
	runtime.MemProfileRate = 4096

	// Start block profiling.
	b.blockprof, err = os.Create(filepath.Join(b.outPath, "block.prof"))
	if err != nil {
		exitWithError(fmt.Errorf("bench: could not create block profile: %v\n", err))
	}
	runtime.SetBlockProfileRate(1)
}

func (b *writeBenchmark) stopProfiling() {
	if b.cpuprof != nil {
		pprof.StopCPUProfile()
		b.cpuprof.Close()
		b.cpuprof = nil
	}
	if b.memprof != nil {
		pprof.Lookup("heap").WriteTo(b.memprof, 0)
		b.memprof.Close()
		b.memprof = nil
	}
	if b.blockprof != nil {
		pprof.Lookup("block").WriteTo(b.blockprof, 0)
		b.blockprof.Close()
		b.blockprof = nil
		runtime.SetBlockProfileRate(0)
	}
}

func indexDocs(ix *index.Index, docs []*InsertDoc, batchSize int) {
	remDocs := docs[:]
	var ids []index.DocID

	for len(remDocs) > 0 {
		n := batchSize
		if n > len(remDocs) {
			n = len(remDocs)
		}

		b, err := ix.Batch()
		if err != nil {
			exitWithError(err)
		}
		for _, d := range remDocs[:n] {
			id := b.Add(d.Terms)
			ids = append(ids, id)
		}
		if err := b.Commit(); err != nil {
			exitWithError(err)
		}

		remDocs = remDocs[n:]
	}
}

func reportSize(dir string) {
	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil || path == dir {
			return err
		}
		fmt.Printf(" > file=%s size=%.03fGiB\n", path[len(dir):], float64(info.Size())/1024/1024/1024)
		return nil
	})
	if err != nil {
		exitWithError(err)
	}
}

func measureTime(stage string, f func()) {
	fmt.Printf(">> start stage=%s\n", stage)
	start := time.Now()
	f()
	fmt.Printf(">> completed stage=%s duration=%s\n", stage, time.Since(start))
}

type InsertDoc struct {
	Terms index.Terms
}

func readPrometheusLabels(r io.Reader) ([]*InsertDoc, error) {
	dec := expfmt.NewDecoder(r, expfmt.FmtProtoText)

	var docs []*InsertDoc
	var mf dto.MetricFamily

	for {
		if err := dec.Decode(&mf); err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}

		for _, m := range mf.GetMetric() {
			d := &InsertDoc{
				Terms: make(index.Terms, len(m.GetLabel())+1),
			}
			d.Terms[0] = index.Term{
				Field: "__name__",
				Val:   mf.GetName(),
			}
			for i, l := range m.GetLabel() {
				d.Terms[i+1] = index.Term{
					Field: l.GetName(),
					Val:   l.GetValue(),
				}
			}
			docs = append(docs, d)
		}
	}

	return docs, nil
}

func exitWithError(err error) {
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}
index/coding.go (new file, 206 lines)
@@ -0,0 +1,206 @@
package index

import (
	"encoding/binary"
	"errors"
	"io"
	"sync"

	"github.com/boltdb/bolt"
)

var encpool buffers

type buffers struct {
	pool sync.Pool
}

func (b *buffers) get(l int) []byte {
	x := b.pool.Get()
	if x == nil {
		return make([]byte, l)
	}
	buf := x.([]byte)
	if cap(buf) < l {
		return make([]byte, l)
	}
	return buf[:l]
}

func (b *buffers) getZero(l int) []byte {
	buf := b.get(l)
	for i := range buf {
		buf[i] = 0
	}
	return buf
}

func (b *buffers) put(buf []byte) {
	b.pool.Put(buf)
}

func (b *buffers) bucketPut(bkt *bolt.Bucket, k, v []byte) error {
	err := bkt.Put(k, v)
	b.put(k)
	return err
}

func (b *buffers) bucketGet(bkt *bolt.Bucket, k []byte) []byte {
	v := bkt.Get(k)
	b.put(k)
	return v
}

func (b *buffers) uint64be(x uint64) []byte {
	buf := b.get(8)
	binary.BigEndian.PutUint64(buf, x)
	return buf
}

func (b *buffers) uvarint(x uint64) []byte {
	buf := b.get(binary.MaxVarintLen64)
	return buf[:binary.PutUvarint(buf, x)]
}

type txbuffs struct {
	buffers *buffers
	done    [][]byte
}

func (b *txbuffs) get(l int) []byte {
	buf := b.buffers.get(l)
	b.done = append(b.done, buf)
	return buf
}

func (b *txbuffs) getZero(l int) []byte {
	buf := b.buffers.getZero(l)
	b.done = append(b.done, buf)
	return buf
}

func (b *txbuffs) release() {
	for _, buf := range b.done {
		b.buffers.put(buf)
	}
}

func (b *txbuffs) put(buf []byte) {
	b.done = append(b.done, buf)
}

func (b *txbuffs) uint64be(x uint64) []byte {
	buf := b.get(8)
	binary.BigEndian.PutUint64(buf, x)
	return buf
}

func (b *txbuffs) uvarint(x uint64) []byte {
	buf := b.get(binary.MaxVarintLen64)
	return buf[:binary.PutUvarint(buf, x)]
}

// pagePool is a pool for page buffers. Using a pool allows concurrency-safe
// reuse of buffers.
var pagePool sync.Pool

// getPage returns a page buffer from the pool. The length of the returned slice is l.
func getPage(l int) []byte {
	x := pagePool.Get()
	if x == nil {
		return make([]byte, l)
	}
	buf := x.([]byte)
	if cap(buf) < l {
		return make([]byte, l)
	}
	return buf[:l]
}

// putPage returns a page buffer to the pool.
func putPage(buf []byte) {
	pagePool.Put(buf)
}

// bufPool is a pool for staging buffers. Using a pool allows concurrency-safe
// reuse of buffers.
var bufPool sync.Pool

// getBuf returns a buffer from the pool. The length of the returned slice is l.
func getBuf(l int) []byte {
	x := bufPool.Get()
	if x == nil {
		return make([]byte, l)
	}
	buf := x.([]byte)
	if cap(buf) < l {
		return make([]byte, l)
	}
	return buf[:l]
}

// putBuf returns a buffer to the pool.
func putBuf(buf []byte) {
	bufPool.Put(buf)
}

func encodeUint64(x uint64) []byte {
	buf := getBuf(8)
	binary.BigEndian.PutUint64(buf, x)
	return buf
}

func decodeUint64(buf []byte) uint64 {
	return binary.BigEndian.Uint64(buf)
}

func writeUvarint(w io.ByteWriter, x uint64) (i int, err error) {
	for x >= 0x80 {
		if err = w.WriteByte(byte(x) | 0x80); err != nil {
			return i, err
		}
		x >>= 7
		i++
	}
	if err = w.WriteByte(byte(x)); err != nil {
		return i, err
	}
	return i + 1, err
}

func writeVarint(w io.ByteWriter, x int64) (i int, err error) {
	ux := uint64(x) << 1
	if x < 0 {
		ux = ^ux
	}
	return writeUvarint(w, ux)
}

func readUvarint(r io.ByteReader) (uint64, int, error) {
	var (
		x uint64
		s uint
	)
	for i := 0; ; i++ {
		b, err := r.ReadByte()
		if err != nil {
			return x, i, err
		}
		if b < 0x80 {
			if i > 9 || i == 9 && b > 1 {
				return x, i + 1, errors.New("varint overflows a 64-bit integer")
			}
			return x | uint64(b)<<s, i + 1, nil
		}
		x |= uint64(b&0x7f) << s
		s += 7
	}
}

func readVarint(r io.ByteReader) (int64, int, error) {
	ux, n, err := readUvarint(r)
	x := int64(ux >> 1)
	if ux&1 != 0 {
		x = ^x
	}
	return x, n, err
}
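The writeVarint/readVarint pair above layers the usual zig-zag mapping (so small negative values still encode in few bytes) on top of the uvarint encoder. Below is a minimal round-trip sketch, not part of the commit: a hypothetical test that would sit next to coding.go in package index and uses bytes.Buffer as both the io.ByteWriter and the io.ByteReader.

// Illustrative sketch only; assumes it lives in package index alongside coding.go.
package index

import (
	"bytes"
	"testing"
)

func TestVarintRoundTrip(t *testing.T) {
	var buf bytes.Buffer
	// -3 zig-zags to the uvarint 5 and therefore encodes as a single byte.
	if _, err := writeVarint(&buf, -3); err != nil {
		t.Fatal(err)
	}
	x, n, err := readVarint(&buf)
	if err != nil || x != -3 || n != 1 {
		t.Fatalf("got (%d, %d, %v), want (-3, 1, nil)", x, n, err)
	}
}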
index/index.go (new file, 716 lines)
@@ -0,0 +1,716 @@
package index

import (
	"bytes"
	"encoding/binary"
	"encoding/gob"
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sync"

	"github.com/boltdb/bolt"
	"github.com/fabxc/tsdb/pages"
)

var (
	errOutOfOrder = errors.New("out of order")
	errNotFound   = errors.New("not found")
)

// Options for an Index.
type Options struct {
}

// DefaultOptions used for opening a new index.
var DefaultOptions = &Options{}

// Index is a fully persistent inverted index of documents with any number of fields
// that map to exactly one term.
type Index struct {
	pbuf *pages.DB
	bolt *bolt.DB
	meta *meta

	rwlock sync.Mutex
}

// Open returns an index located in the given path. If none exists a new
// one is created.
func Open(path string, opts *Options) (*Index, error) {
	if opts == nil {
		opts = DefaultOptions
	}

	if err := os.MkdirAll(path, 0777); err != nil {
		return nil, err
	}

	bdb, err := bolt.Open(filepath.Join(path, "kv"), 0666, nil)
	if err != nil {
		return nil, err
	}
	pdb, err := pages.Open(filepath.Join(path, "pb"), 0666, &pages.Options{
		PageSize: pageSize,
	})
	if err != nil {
		return nil, err
	}
	ix := &Index{
		bolt: bdb,
		pbuf: pdb,
		meta: &meta{},
	}
	if err := ix.bolt.Update(ix.init); err != nil {
		return nil, err
	}
	return ix, nil
}

// Close closes the index.
func (ix *Index) Close() error {
	err0 := ix.pbuf.Close()
	err1 := ix.bolt.Close()
	if err0 != nil {
		return err0
	}
	return err1
}

var (
	bktMeta     = []byte("meta")
	bktDocs     = []byte("docs")
	bktTerms    = []byte("terms")
	bktTermIDs  = []byte("term_ids")
	bktSkiplist = []byte("skiplist")

	keyMeta = []byte("meta")
)

func (ix *Index) init(tx *bolt.Tx) error {
	// Ensure all buckets exist. Any other index methods assume
	// that these buckets exist and may panic otherwise.
	for _, bn := range [][]byte{
		bktMeta, bktTerms, bktTermIDs, bktDocs, bktSkiplist,
	} {
		if _, err := tx.CreateBucketIfNotExists(bn); err != nil {
			return fmt.Errorf("create bucket %q failed: %s", string(bn), err)
		}
	}

	// Read the meta state if the index was already initialized.
	mbkt := tx.Bucket(bktMeta)
	if v := mbkt.Get(keyMeta); v != nil {
		if err := ix.meta.read(v); err != nil {
			return fmt.Errorf("decoding meta failed: %s", err)
		}
	} else {
		// Index not initialized yet, set up meta information.
		ix.meta = &meta{
			LastDocID:  0,
			LastTermID: 0,
		}
		v, err := ix.meta.bytes()
		if err != nil {
			return fmt.Errorf("encoding meta failed: %s", err)
		}
		if err := mbkt.Put(keyMeta, v); err != nil {
			return fmt.Errorf("creating meta failed: %s", err)
		}
	}

	return nil
}

// Querier starts a new query session against the index.
func (ix *Index) Querier() (*Querier, error) {
	kvtx, err := ix.bolt.Begin(false)
	if err != nil {
		return nil, err
	}
	pbtx, err := ix.pbuf.Begin(false)
	if err != nil {
		kvtx.Rollback()
		return nil, err
	}
	return &Querier{
		kvtx: kvtx,
		pbtx: pbtx,
		// TODO(fabxc): consider getting these buckets lazily.
		termBkt:     kvtx.Bucket(bktTerms),
		termidBkt:   kvtx.Bucket(bktTermIDs),
		docBkt:      kvtx.Bucket(bktDocs),
		skiplistBkt: kvtx.Bucket(bktSkiplist),
	}, nil
}

// Querier encapsulates the index for several queries.
type Querier struct {
	kvtx *bolt.Tx
	pbtx *pages.Tx

	termBkt     *bolt.Bucket
	termidBkt   *bolt.Bucket
	docBkt      *bolt.Bucket
	skiplistBkt *bolt.Bucket
}

// Close closes the underlying index transactions.
func (q *Querier) Close() error {
	err0 := q.pbtx.Rollback()
	err1 := q.kvtx.Rollback()
	if err0 != nil {
		return err0
	}
	return err1
}

// Terms returns all terms for the key field matching the provided matcher.
// If the matcher is nil, all terms for the field are returned.
func (q *Querier) Terms(key string, m Matcher) []string {
	if m == nil {
		m = AnyMatcher
	}
	return q.termsForMatcher(key, m)
}

// Search returns an iterator over all document IDs that match all
// provided matchers.
func (q *Querier) Search(key string, m Matcher) (Iterator, error) {
	tids := q.termIDsForMatcher(key, m)
	its := make([]Iterator, 0, len(tids))

	for _, t := range tids {
		it, err := q.postingsIter(t)
		if err != nil {
			return nil, err
		}
		its = append(its, it)
	}

	if len(its) == 0 {
		return nil, nil
	}
	return Merge(its...), nil
}

// postingsIter returns an iterator over the postings list of term t.
func (q *Querier) postingsIter(t termid) (Iterator, error) {
	b := q.skiplistBkt.Bucket(t.bytes())
	if b == nil {
		return nil, errNotFound
	}

	it := &skippingIterator{
		skiplist: &boltSkiplistCursor{
			k:   uint64(t),
			c:   b.Cursor(),
			bkt: b,
		},
		iterators: iteratorStoreFunc(func(k uint64) (Iterator, error) {
			data, err := q.pbtx.Get(k)
			if err != nil {
				return nil, errNotFound
			}
			// TODO(fabxc): for now, offset is zero, pages have no header
			// and are always delta encoded.
			return newPageDelta(data).cursor(), nil
		}),
	}

	return it, nil
}

func (q *Querier) termsForMatcher(key string, m Matcher) []string {
	c := q.termBkt.Cursor()
	pref := append([]byte(key), 0xff)

	var terms []string
	// TODO(fabxc): We scan the entire term value range for the field. Improve this by direct
	// and prefixed seeks depending on the matcher.
	for k, _ := c.Seek(pref); bytes.HasPrefix(k, pref); k, _ = c.Next() {
		if m.Match(string(k[len(pref):])) {
			terms = append(terms, string(k[len(pref):]))
		}
	}
	return terms
}

func (q *Querier) termIDsForMatcher(key string, m Matcher) termids {
	c := q.termBkt.Cursor()
	pref := append([]byte(key), 0xff)

	var ids termids
	// TODO(fabxc): We scan the entire term value range for the field. Improve this by direct
	// and prefixed seeks depending on the matcher.
	for k, v := c.Seek(pref); bytes.HasPrefix(k, pref); k, v = c.Next() {
		if m.Match(string(k[len(pref):])) {
			ids = append(ids, newTermID(v))
		}
	}
	return ids
}

// Doc returns the document with the given ID.
func (q *Querier) Doc(id DocID) (Terms, error) {
	v := q.docBkt.Get(id.bytes())
	if v == nil {
		return nil, errNotFound
	}
	tids := newTermIDs(v)

	// TODO(fabxc): consider at least a per-session cache for these.
	terms := make(Terms, len(tids))
	for i, t := range tids {
		// TODO(fabxc): is this encode/decode cycle here worth the space savings?
		// If we stored plain uint64s we can just pass the slice back in.
		v := q.termidBkt.Get(t.bytes())
		if v == nil {
			return nil, fmt.Errorf("term not found")
		}
		term, err := newTerm(v)
		if err != nil {
			return nil, err
		}
		terms[i] = term
	}
	return terms, nil
}

// Delete removes all documents in the iterator from the index.
// It returns the number of deleted documents.
func (ix *Index) Delete(it Iterator) (int, error) {
	panic("not implemented")
}

// Batch starts a new batch against the index.
func (ix *Index) Batch() (*Batch, error) {
	// Lock writes so we can safely pre-allocate term and doc IDs.
	ix.rwlock.Lock()

	tx, err := ix.bolt.Begin(false)
	if err != nil {
		return nil, err
	}
	b := &Batch{
		ix:        ix,
		tx:        tx,
		meta:      &meta{},
		termBkt:   tx.Bucket(bktTerms),
		termidBkt: tx.Bucket(bktTermIDs),
		terms:     map[Term]*batchTerm{},
	}
	*b.meta = *ix.meta
	return b, nil
}

// meta contains information about the state of the index.
type meta struct {
	LastDocID  DocID
	LastTermID termid
}

// read initializes the meta from a byte slice.
func (m *meta) read(b []byte) error {
	return gob.NewDecoder(bytes.NewReader(b)).Decode(m)
}

// bytes returns a byte slice representation of the meta.
func (m *meta) bytes() ([]byte, error) {
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(m); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// Terms is a sortable list of terms.
type Terms []Term

func (t Terms) Len() int      { return len(t) }
func (t Terms) Swap(i, j int) { t[i], t[j] = t[j], t[i] }

func (t Terms) Less(i, j int) bool {
	if t[i].Field < t[j].Field {
		return true
	}
	if t[i].Field > t[j].Field {
		return false
	}
	return t[i].Val < t[j].Val
}

// Term is a term for the specified field.
type Term struct {
	Field, Val string
}

func newTerm(b []byte) (t Term, e error) {
	c := bytes.SplitN(b, []byte{0xff}, 2)
	if len(c) != 2 {
		return t, fmt.Errorf("invalid term")
	}
	t.Field = string(c[0])
	t.Val = string(c[1])
	return t, nil
}

// bytes returns a byte slice representation of the term.
func (t *Term) bytes() []byte {
	b := make([]byte, 0, len(t.Field)+1+len(t.Val))
	b = append(b, []byte(t.Field)...)
	b = append(b, 0xff)
	return append(b, []byte(t.Val)...)
}

// Matcher checks whether a value for a key satisfies a check condition.
type Matcher interface {
	Match(value string) bool
}

// AnyMatcher matches any term value for a field.
var AnyMatcher = anyMatcher{}

type anyMatcher struct{}

func (anyMatcher) Match(_ string) bool {
	return true
}

// EqualMatcher matches exactly one value for a particular label.
type EqualMatcher struct {
	val string
}

func NewEqualMatcher(val string) *EqualMatcher {
	return &EqualMatcher{val: val}
}

func (m *EqualMatcher) Match(s string) bool { return m.val == s }

// RegexpMatcher matches labels for the fixed key for which the value
// matches a regular expression.
type RegexpMatcher struct {
	re *regexp.Regexp
}

func NewRegexpMatcher(expr string) (*RegexpMatcher, error) {
	re, err := regexp.Compile(expr)
	if err != nil {
		return nil, err
	}
	return &RegexpMatcher{re: re}, nil
}

func (m *RegexpMatcher) Match(s string) bool { return m.re.MatchString(s) }

// DocID is a unique identifier for a document.
type DocID uint64

func newDocID(b []byte) DocID {
	return DocID(decodeUint64(b))
}

func (d DocID) bytes() []byte {
	return encodeUint64(uint64(d))
}

type termid uint64

func newTermID(b []byte) termid {
	return termid(decodeUint64(b))
}

func (t termid) bytes() []byte {
	return encodeUint64(uint64(t))
}

type termids []termid

func (t termids) Len() int           { return len(t) }
func (t termids) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
func (t termids) Less(i, j int) bool { return t[i] < t[j] }

// newTermIDs reads a sequence of uvarints from b and appends them
// to the term IDs.
func newTermIDs(b []byte) (t termids) {
	for len(b) > 0 {
		k, n := binary.Uvarint(b)
		t = append(t, termid(k))
		b = b[n:]
	}
	return t
}

// bytes encodes the term IDs as a sequence of uvarints.
func (t termids) bytes() []byte {
	b := make([]byte, len(t)*binary.MaxVarintLen64)
	n := 0
	for _, x := range t {
		n += binary.PutUvarint(b[n:], uint64(x))
	}
	return b[:n]
}

// Batch collects multiple indexing actions and allows applying them
// to the persistent index all at once for improved performance.
type Batch struct {
	ix   *Index
	tx   *bolt.Tx
	meta *meta

	termBkt   *bolt.Bucket
	termidBkt *bolt.Bucket

	docs  []*batchDoc
	terms map[Term]*batchTerm
}

type batchDoc struct {
	id    DocID
	terms termids
}

type batchTerm struct {
	id   termid  // zero if term has not been added yet
	docs []DocID // documents to be indexed for the term
}

// Add adds a new document with the given terms to the index and
// returns a new unique ID for it.
// The ID only becomes valid after the batch has been committed successfully.
func (b *Batch) Add(terms Terms) DocID {
	b.meta.LastDocID++
	id := b.meta.LastDocID
	tids := make(termids, 0, len(terms))

	for _, t := range terms {
		tids = append(tids, b.addTerm(id, t))
	}

	b.docs = append(b.docs, &batchDoc{id: id, terms: tids})
	return id
}

// SecondaryIndex indexes the document ID for additional terms. The terms
// are not stored as part of the document's forward index as the initial terms.
// The caller has to ensure that the document IDs are added to terms in
// increasing order.
func (b *Batch) SecondaryIndex(id DocID, terms ...Term) {
	for _, t := range terms {
		b.addTerm(id, t)
	}
}

// addTerm adds the document ID to the term's postings list and returns
// the term's ID.
func (b *Batch) addTerm(id DocID, t Term) termid {
	tb := b.terms[t]
	// Populate term if necessary and allocate a new ID if it
	// hasn't been created in the database before.
	if tb == nil {
		tb = &batchTerm{docs: make([]DocID, 0, 1024)}
		b.terms[t] = tb

		if idb := b.termBkt.Get(t.bytes()); idb != nil {
			tb.id = termid(decodeUint64(idb))
		} else {
			b.meta.LastTermID++
			tb.id = b.meta.LastTermID
		}
	}
	tb.docs = append(tb.docs, id)
	return tb.id
}

// Commit executes the batched indexing against the underlying index.
func (b *Batch) Commit() error {
	defer b.ix.rwlock.Unlock()
	// Close the read transaction to open a write transaction. The outer rwlock
	// still guards against intermittent writes between switching.
	if err := b.tx.Rollback(); err != nil {
		return err
	}
	err := b.ix.bolt.Update(func(tx *bolt.Tx) error {
		docsBkt := tx.Bucket(bktDocs)
		// Add document IDs to the forward index.
		for _, d := range b.docs {
			if err := docsBkt.Put(d.id.bytes(), d.terms.bytes()); err != nil {
				return err
			}
		}
		// Add newly allocated terms.
		termBkt := tx.Bucket(bktTerms)
		termidBkt := tx.Bucket(bktTermIDs)

		for t, tb := range b.terms {
			if tb.id > b.ix.meta.LastTermID {
				bid := encodeUint64(uint64(tb.id))
				tby := t.bytes()

				if err := termBkt.Put(tby, bid); err != nil {
					return fmt.Errorf("setting term failed: %s", err)
				}
				if err := termidBkt.Put(bid, tby); err != nil {
					return fmt.Errorf("setting term failed: %s", err)
				}
			}
		}

		pbtx, err := b.ix.pbuf.Begin(true)
		if err != nil {
			return err
		}
		if err := b.writePostingsBatch(tx, pbtx); err != nil {
			pbtx.Rollback()
			return err
		}
		if err := pbtx.Commit(); err != nil {
			return err
		}
		return b.updateMeta(tx)
	})
	return err
}

// Rollback drops all changes applied in the batch.
func (b *Batch) Rollback() error {
	b.ix.rwlock.Unlock()
	return b.tx.Rollback()
}

// writePostingsBatch adds the postings batch to the index.
func (b *Batch) writePostingsBatch(kvtx *bolt.Tx, pbtx *pages.Tx) error {
	skiplist := kvtx.Bucket(bktSkiplist)

	// createPage allocates a new delta-encoded page starting with id as its first entry.
	createPage := func(id DocID) (page, error) {
		pg := newPageDelta(make([]byte, pageSize-pages.PageHeaderSize))
		if err := pg.init(id); err != nil {
			return nil, err
		}
		return pg, nil
	}

	for _, tb := range b.terms {
		ids := tb.docs

		b, err := skiplist.CreateBucketIfNotExists(tb.id.bytes())
		if err != nil {
			return err
		}
		sl := &boltSkiplistCursor{
			k:   uint64(tb.id),
			c:   b.Cursor(),
			bkt: b,
		}

		var (
			pg  page       // Page we are currently appending to.
			pc  pageCursor // Its cursor.
			pid uint64     // Its ID.
		)
		// Get the most recent page. If none exist, the entire postings list is new.
		_, pid, err = sl.seek(math.MaxUint64)
		if err != nil {
			if err != io.EOF {
				return err
			}
			// No most recent page for the key exists. The postings list is new and
			// we have to allocate a new page ID for it.
			if pg, err = createPage(ids[0]); err != nil {
				return err
			}
			pc = pg.cursor()
			ids = ids[1:]
		} else {
			// Load the most recent page.
			pdata, err := pbtx.Get(pid)
			if pdata == nil {
				return fmt.Errorf("error getting page for ID %q: %s", pid, err)
			}

			// The byte slice is mmapped from bolt. We have to copy it to make modifications.
			pdatac := make([]byte, len(pdata))
			copy(pdatac, pdata)

			pg = newPageDelta(pdatac)
			pc = pg.cursor()
		}

		for i := 0; i < len(ids); i++ {
			if err = pc.append(ids[i]); err == errPageFull {
				// We couldn't append to the page because it was full.
				// Store away the old page...
				if pid == 0 {
					// The page was new.
					pid, err = pbtx.Add(pg.data())
					if err != nil {
						return err
					}
					first, err := pc.Seek(0)
					if err != nil {
						return err
					}
					if err := sl.append(first, pid); err != nil {
						return err
					}
				} else {
					if err = pbtx.Set(pid, pg.data()); err != nil {
						return err
					}
				}

				// ... and allocate a new page.
				pid = 0
				if pg, err = createPage(ids[i]); err != nil {
					return err
				}
				pc = pg.cursor()
			} else if err != nil {
				return err
			}
		}
		// Save the last page we have written to.
		if pid == 0 {
			// The page was new.
			pid, err = pbtx.Add(pg.data())
			if err != nil {
				return err
			}
			first, err := pc.Seek(0)
			if err != nil {
				return err
			}
			if err := sl.append(first, pid); err != nil {
				return err
			}
		} else {
			if err = pbtx.Set(pid, pg.data()); err != nil {
				return err
			}
		}
	}
	return nil
}

// updateMeta updates the index's meta information based on the changes
// applied with the batch.
func (b *Batch) updateMeta(tx *bolt.Tx) error {
	b.ix.meta = b.meta
	bkt := tx.Bucket([]byte(bktMeta))
	if bkt == nil {
		return fmt.Errorf("meta bucket not found")
	}
	v, err := b.ix.meta.bytes()
	if err != nil {
		return fmt.Errorf("error encoding meta: %s", err)
	}
	return bkt.Put([]byte(keyMeta), v)
}
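Taken together, index.go is the public surface of the package: Open an index directory, stage documents in a Batch, Commit, then query through a Querier with a Matcher. The following is a minimal usage sketch, not part of the commit; the directory path and the "job"/"prometheus" field and value are illustrative only, while the import path matches the one used by cmd/index above.

package main

import (
	"fmt"
	"log"

	"github.com/fabxc/tsdb/index"
)

func main() {
	// Open (or create) an index; nil options select index.DefaultOptions.
	ix, err := index.Open("/tmp/index-demo", nil)
	if err != nil {
		log.Fatal(err)
	}
	defer ix.Close()

	// Stage one document and commit the batch.
	b, err := ix.Batch()
	if err != nil {
		log.Fatal(err)
	}
	id := b.Add(index.Terms{{Field: "job", Val: "prometheus"}})
	if err := b.Commit(); err != nil {
		log.Fatal(err)
	}

	// Query all documents whose "job" field equals "prometheus".
	q, err := ix.Querier()
	if err != nil {
		log.Fatal(err)
	}
	defer q.Close()

	it, err := q.Search("job", index.NewEqualMatcher("prometheus"))
	if err != nil {
		log.Fatal(err)
	}
	// it is non-nil here because a matching term was just committed;
	// Search returns a nil iterator when no term matches.
	ids, err := index.ExpandIterator(it)
	fmt.Println(id, ids, err)
}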
index/index_test.go (new file, 1 line)
@@ -0,0 +1 @@
package index
index/iter.go (new file, 294 lines)
@@ -0,0 +1,294 @@
package index

import (
	"io"
	"sort"
)

// An Iterator provides sorted iteration over a list of uint64s.
type Iterator interface {
	// Next retrieves the next document ID in the postings list.
	Next() (DocID, error)
	// Seek moves the cursor to ID or the closest following one, if it doesn't exist.
	// It returns the ID at the position.
	Seek(id DocID) (DocID, error)
}

type mergeIterator struct {
	i1, i2 Iterator
	v1, v2 DocID
	e1, e2 error
}

func (it *mergeIterator) Next() (DocID, error) {
	if it.e1 == io.EOF && it.e2 == io.EOF {
		return 0, io.EOF
	}
	if it.e1 != nil {
		if it.e1 != io.EOF {
			return 0, it.e1
		}
		x := it.v2
		it.v2, it.e2 = it.i2.Next()
		return x, nil
	}
	if it.e2 != nil {
		if it.e2 != io.EOF {
			return 0, it.e2
		}
		x := it.v1
		it.v1, it.e1 = it.i1.Next()
		return x, nil
	}
	if it.v1 < it.v2 {
		x := it.v1
		it.v1, it.e1 = it.i1.Next()
		return x, nil
	} else if it.v2 < it.v1 {
		x := it.v2
		it.v2, it.e2 = it.i2.Next()
		return x, nil
	} else {
		x := it.v1
		it.v1, it.e1 = it.i1.Next()
		it.v2, it.e2 = it.i2.Next()
		return x, nil
	}
}

func (it *mergeIterator) Seek(id DocID) (DocID, error) {
	// Advance both iterators to the seek target; Next then returns the
	// smallest remaining ID of the union.
	it.v1, it.e1 = it.i1.Seek(id)
	it.v2, it.e2 = it.i2.Seek(id)
	return it.Next()
}

// Merge returns a new Iterator over the union of the input iterators.
func Merge(its ...Iterator) Iterator {
	if len(its) == 0 {
		return nil
	}
	i1 := its[0]

	for _, i2 := range its[1:] {
		i1 = &mergeIterator{i1: i1, i2: i2}
	}
	return i1
}

// ExpandIterator walks through the iterator and returns the result list.
func ExpandIterator(it Iterator) ([]DocID, error) {
	var (
		res = []DocID{}
		v   DocID
		err error
	)
	for v, err = it.Seek(0); err == nil; v, err = it.Next() {
		res = append(res, v)
	}
	if err == io.EOF {
		return res, nil
	}
	return res, err
}

type intersectIterator struct {
	i1, i2 Iterator
	v1, v2 DocID
	e1, e2 error
}

// Intersect returns a new Iterator over the intersection of the input iterators.
func Intersect(its ...Iterator) Iterator {
	if len(its) == 0 {
		return nil
	}
	i1 := its[0]

	for _, i2 := range its[1:] {
		i1 = &intersectIterator{i1: i1, i2: i2}
	}
	return i1
}

func (it *intersectIterator) Next() (DocID, error) {
	for {
		if it.e1 != nil {
			return 0, it.e1
		}
		if it.e2 != nil {
			return 0, it.e2
		}
		if it.v1 < it.v2 {
			it.v1, it.e1 = it.i1.Seek(it.v2)
		} else if it.v2 < it.v1 {
			it.v2, it.e2 = it.i2.Seek(it.v1)
		} else {
			v := it.v1
			it.v1, it.e1 = it.i1.Next()
			it.v2, it.e2 = it.i2.Next()
			return v, nil
		}
	}
}

func (it *intersectIterator) Seek(id DocID) (DocID, error) {
	// We have to advance both iterators. Otherwise, we get a false-positive
	// match on 0 if only one of the iterators has it.
	it.v1, it.e1 = it.i1.Seek(id)
	it.v2, it.e2 = it.i2.Seek(id)
	return it.Next()
}

// A skiplistIterator iterates through a list of value/pointer pairs.
type skiplistIterator interface {
	// seek returns the value and pointer at or before v.
	seek(v DocID) (val DocID, ptr uint64, err error)
	// next returns the next value/pointer pair.
	next() (val DocID, ptr uint64, err error)
}

// iteratorStore allows retrieving an iterator based on a key.
type iteratorStore interface {
	get(uint64) (Iterator, error)
}

// skippingIterator implements the Iterator interface based on a skiplist, which
// allows jumping to the iterator closest to the seeked value.
//
// This iterator allows for speed-ups in seeks if the underlying data cannot
// be searched in O(log n).
// Ideally, the skiplist is seekable in O(log n).
type skippingIterator struct {
	skiplist  skiplistIterator
	iterators iteratorStore

	// The iterator holding the next value.
	cur Iterator
}

// Seek implements the Iterator interface.
func (it *skippingIterator) Seek(id DocID) (DocID, error) {
	_, ptr, err := it.skiplist.seek(id)
	if err != nil {
		return 0, err
	}
	cur, err := it.iterators.get(ptr)
	if err != nil {
		return 0, err
	}
	it.cur = cur

	return it.cur.Seek(id)
}

// Next implements the Iterator interface.
func (it *skippingIterator) Next() (DocID, error) {
	// If Next was called without a preceding Seek.
	// TODO(fabxc): should this just panic and an initial call to Seek() be required?
	if it.cur == nil {
		return it.Seek(0)
	}

	if id, err := it.cur.Next(); err == nil {
		return id, nil
	} else if err != io.EOF {
		return 0, err
	}
	// We reached the end of the current iterator. Get the next iterator through
	// our skiplist.
	_, ptr, err := it.skiplist.next()
	if err != nil {
		// Here we return the actual io.EOF if we reached the end of the iterator
		// retrieved from the last skiplist entry.
		return 0, err
	}
	// Iterate over the next iterator.
	cur, err := it.iterators.get(ptr)
	if err != nil {
		return 0, err
	}
	it.cur = cur

	// Return the first value in the new iterator.
	return it.cur.Seek(0)
}

// plainListIterator implements the Iterator interface on a sorted list of integers.
type plainListIterator struct {
	list list
	pos  int
}

func newPlainListIterator(l []DocID) *plainListIterator {
	it := &plainListIterator{list: list(l)}
	sort.Sort(it.list)
	return it
}

func (it *plainListIterator) Seek(id DocID) (DocID, error) {
	it.pos = sort.Search(it.list.Len(), func(i int) bool { return it.list[i] >= id })
	return it.Next()
}

func (it *plainListIterator) Next() (DocID, error) {
	if it.pos >= it.list.Len() {
		return 0, io.EOF
	}
	x := it.list[it.pos]
	it.pos++
	return x, nil
}

type list []DocID

func (l list) Len() int           { return len(l) }
func (l list) Less(i, j int) bool { return l[i] < l[j] }
func (l list) Swap(i, j int)      { l[i], l[j] = l[j], l[i] }

// plainSkiplistIterator implements the skiplistIterator interface on a plain
// in-memory mapping.
type plainSkiplistIterator struct {
	m    map[DocID]uint64
	keys list
	pos  int
}

func newPlainSkiplistIterator(m map[DocID]uint64) *plainSkiplistIterator {
	var keys list
	for k := range m {
		keys = append(keys, k)
	}
	sort.Sort(keys)

	return &plainSkiplistIterator{
		m:    m,
		keys: keys,
	}
}

// seek implements the skiplistIterator interface.
func (it *plainSkiplistIterator) seek(id DocID) (DocID, uint64, error) {
	pos := sort.Search(len(it.keys), func(i int) bool { return it.keys[i] >= id })
	// The skiplist iterator points to the element at or before the seeked value;
	// clamp to the last element if id is beyond all keys.
	if pos > 0 && (pos == len(it.keys) || it.keys[pos] > id) {
		it.pos = pos - 1
	} else {
		it.pos = pos
	}
	return it.next()
}

// next implements the skiplistIterator interface.
func (it *plainSkiplistIterator) next() (DocID, uint64, error) {
	if it.pos >= len(it.keys) {
		return 0, 0, io.EOF
	}
	k := it.keys[it.pos]
	it.pos++
	return k, it.m[k], nil
}
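These building blocks compose: each Querier.Search above already returns a Merge over one field's matching postings lists, and several such results can be combined with Intersect to require all conditions at once. A small sketch in package index using the plain in-memory iterators defined above; the helper and its document IDs are illustrative only, not part of the commit.

// exampleCompose unions two postings lists of one field and intersects the
// result with a postings list of another field — the same shape a query with
// two conditions would produce.
func exampleCompose() ([]DocID, error) {
	a1 := newPlainListIterator([]DocID{1, 3, 5})
	a2 := newPlainListIterator([]DocID{2, 3, 8})
	b1 := newPlainListIterator([]DocID{3, 5, 8})

	// Merge(a1, a2) yields 1, 2, 3, 5, 8; intersecting with b1 yields 3, 5, 8.
	return ExpandIterator(Intersect(Merge(a1, a2), b1))
}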
228
index/iter_test.go
Normal file
228
index/iter_test.go
Normal file
|
@ -0,0 +1,228 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMultiIntersect(t *testing.T) {
|
||||
var cases = []struct {
|
||||
a, b, c []DocID
|
||||
res []DocID
|
||||
}{
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5, 6, 1000, 1001},
|
||||
b: []DocID{2, 4, 5, 6, 7, 8, 999, 1001},
|
||||
c: []DocID{1, 2, 5, 6, 7, 8, 1001, 1200},
|
||||
res: []DocID{2, 5, 6, 1001},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
i1 := newPlainListIterator(c.a)
|
||||
i2 := newPlainListIterator(c.b)
|
||||
i3 := newPlainListIterator(c.c)
|
||||
|
||||
res, err := ExpandIterator(Intersect(i1, i2, i3))
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, c.res) {
|
||||
t.Fatalf("Expected %v but got %v", c.res, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIntersectIterator(t *testing.T) {
|
||||
var cases = []struct {
|
||||
a, b []DocID
|
||||
res []DocID
|
||||
}{
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5},
|
||||
b: []DocID{6, 7, 8, 9, 10},
|
||||
res: []DocID{},
|
||||
},
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5},
|
||||
b: []DocID{4, 5, 6, 7, 8},
|
||||
res: []DocID{4, 5},
|
||||
},
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 9, 10},
|
||||
b: []DocID{1, 4, 5, 6, 7, 8, 10, 11},
|
||||
res: []DocID{1, 4, 10},
|
||||
}, {
|
||||
a: []DocID{1},
|
||||
b: []DocID{0, 1},
|
||||
res: []DocID{1},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
i1 := newPlainListIterator(c.a)
|
||||
i2 := newPlainListIterator(c.b)
|
||||
|
||||
res, err := ExpandIterator(&intersectIterator{i1: i1, i2: i2})
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, c.res) {
|
||||
t.Fatalf("Expected %v but got %v", c.res, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeIntersect(t *testing.T) {
|
||||
var cases = []struct {
|
||||
a, b, c []DocID
|
||||
res []DocID
|
||||
}{
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5, 6, 1000, 1001},
|
||||
b: []DocID{2, 4, 5, 6, 7, 8, 999, 1001},
|
||||
c: []DocID{1, 2, 5, 6, 7, 8, 1001, 1200},
|
||||
res: []DocID{1, 2, 3, 4, 5, 6, 7, 8, 999, 1000, 1001, 1200},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
i1 := newPlainListIterator(c.a)
|
||||
i2 := newPlainListIterator(c.b)
|
||||
i3 := newPlainListIterator(c.c)
|
||||
|
||||
res, err := ExpandIterator(Merge(i1, i2, i3))
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, c.res) {
|
||||
t.Fatalf("Expected %v but got %v", c.res, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkIntersect(t *testing.B) {
|
||||
var a, b, c, d []DocID
|
||||
|
||||
for i := 0; i < 10000000; i += 2 {
|
||||
a = append(a, DocID(i))
|
||||
}
|
||||
for i := 5000000; i < 5000100; i += 4 {
|
||||
b = append(b, DocID(i))
|
||||
}
|
||||
for i := 5090000; i < 5090600; i += 4 {
|
||||
b = append(b, DocID(i))
|
||||
}
|
||||
for i := 4990000; i < 5100000; i++ {
|
||||
c = append(c, DocID(i))
|
||||
}
|
||||
for i := 4000000; i < 6000000; i++ {
|
||||
d = append(d, DocID(i))
|
||||
}
|
||||
|
||||
i1 := newPlainListIterator(a)
|
||||
i2 := newPlainListIterator(b)
|
||||
i3 := newPlainListIterator(c)
|
||||
i4 := newPlainListIterator(d)
|
||||
|
||||
t.ResetTimer()
|
||||
|
||||
for i := 0; i < t.N; i++ {
|
||||
if _, err := ExpandIterator(Intersect(i1, i2, i3, i4)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeIterator(t *testing.T) {
|
||||
var cases = []struct {
|
||||
a, b []DocID
|
||||
res []DocID
|
||||
}{
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5},
|
||||
b: []DocID{6, 7, 8, 9, 10},
|
||||
res: []DocID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||
},
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 5},
|
||||
b: []DocID{4, 5, 6, 7, 8},
|
||||
res: []DocID{1, 2, 3, 4, 5, 6, 7, 8},
|
||||
},
|
||||
{
|
||||
a: []DocID{1, 2, 3, 4, 9, 10},
|
||||
b: []DocID{1, 4, 5, 6, 7, 8, 10, 11},
|
||||
res: []DocID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
i1 := newPlainListIterator(c.a)
|
||||
i2 := newPlainListIterator(c.b)
|
||||
|
||||
res, err := ExpandIterator(&mergeIterator{i1: i1, i2: i2})
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error: %s", err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, c.res) {
|
||||
t.Fatalf("Expected %v but got %v", c.res, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSkippingIterator(t *testing.T) {
|
||||
var cases = []struct {
|
||||
skiplist skiplistIterator
|
||||
its iteratorStore
|
||||
res []DocID
|
||||
}{
|
||||
{
|
||||
skiplist: newPlainSkiplistIterator(map[DocID]uint64{
|
||||
5: 3,
|
||||
50: 2,
|
||||
500: 1,
|
||||
}),
|
||||
its: testIteratorStore{
|
||||
3: newPlainListIterator(list{5, 7, 8, 9}),
|
||||
2: newPlainListIterator(list{54, 60, 61}),
|
||||
1: newPlainListIterator(list{1200, 1300, 100000}),
|
||||
},
|
||||
res: []DocID{5, 7, 8, 9, 54, 60, 61, 1200, 1300, 100000},
|
||||
},
|
||||
{
|
||||
skiplist: newPlainSkiplistIterator(map[DocID]uint64{
|
||||
0: 3,
|
||||
50: 2,
|
||||
}),
|
||||
its: testIteratorStore{
|
||||
3: newPlainListIterator(list{5, 7, 8, 9}),
|
||||
2: newPlainListIterator(list{54, 60, 61}),
|
||||
},
|
||||
res: []DocID{5, 7, 8, 9, 54, 60, 61},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
it := &skippingIterator{
|
||||
skiplist: c.skiplist,
|
||||
iterators: c.its,
|
||||
}
|
||||
res, err := ExpandIterator(it)
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error", err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, c.res) {
|
||||
t.Fatalf("Expected %v but got %v", c.res, res)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type testIteratorStore map[uint64]Iterator
|
||||
|
||||
func (s testIteratorStore) get(id uint64) (Iterator, error) {
|
||||
it, ok := s[id]
|
||||
if !ok {
|
||||
return nil, errNotFound
|
||||
}
|
||||
return it, nil
|
||||
}
|
108 index/page.go Normal file
@@ -0,0 +1,108 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"io"
|
||||
)
|
||||
|
||||
const pageSize = 2048
|
||||
|
||||
var errPageFull = errors.New("page full")
|
||||
|
||||
type pageCursor interface {
|
||||
Iterator
|
||||
append(v DocID) error
|
||||
}
|
||||
|
||||
type page interface {
|
||||
cursor() pageCursor
|
||||
init(v DocID) error
|
||||
data() []byte
|
||||
}
|
||||
|
||||
type pageDelta struct {
|
||||
b []byte
|
||||
}
|
||||
|
||||
type pageType uint8
|
||||
|
||||
const (
|
||||
pageTypeDelta pageType = iota
|
||||
)
|
||||
|
||||
func newPageDelta(data []byte) *pageDelta {
|
||||
return &pageDelta{b: data}
|
||||
}
|
||||
|
||||
func (p *pageDelta) init(v DocID) error {
|
||||
// Write first value.
|
||||
binary.PutUvarint(p.b, uint64(v))
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *pageDelta) cursor() pageCursor {
|
||||
return &pageDeltaCursor{data: p.b}
|
||||
}
|
||||
|
||||
func (p *pageDelta) data() []byte {
|
||||
return p.b
|
||||
}
|
||||
|
||||
type pageDeltaCursor struct {
|
||||
data []byte
|
||||
pos int
|
||||
cur DocID
|
||||
}
|
||||
|
||||
func (p *pageDeltaCursor) append(id DocID) error {
|
||||
// Run to the end.
|
||||
_, err := p.Next()
|
||||
for ; err == nil; _, err = p.Next() {
|
||||
// Consume.
|
||||
}
|
||||
if err != io.EOF {
|
||||
return err
|
||||
}
|
||||
if len(p.data)-p.pos < binary.MaxVarintLen64 {
|
||||
return errPageFull
|
||||
}
|
||||
if p.cur >= id {
|
||||
return errOutOfOrder
|
||||
}
|
||||
p.pos += binary.PutUvarint(p.data[p.pos:], uint64(id-p.cur))
|
||||
p.cur = id
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *pageDeltaCursor) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *pageDeltaCursor) Seek(min DocID) (v DocID, err error) {
|
||||
if min < p.cur {
|
||||
p.pos = 0
|
||||
}
|
||||
for v, err = p.Next(); err == nil && v < min; v, err = p.Next() {
|
||||
// Consume.
|
||||
}
|
||||
return p.cur, err
|
||||
}
|
||||
|
||||
func (p *pageDeltaCursor) Next() (DocID, error) {
|
||||
var n int
|
||||
var dv uint64
|
||||
if p.pos == 0 {
|
||||
dv, n = binary.Uvarint(p.data)
|
||||
p.cur = DocID(dv)
|
||||
} else {
|
||||
dv, n = binary.Uvarint(p.data[p.pos:])
|
||||
if n <= 0 || dv == 0 {
|
||||
return 0, io.EOF
|
||||
}
|
||||
p.cur += DocID(dv)
|
||||
}
|
||||
p.pos += n
|
||||
|
||||
return p.cur, nil
|
||||
}
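The cursor above relies on the page layout written by init and append: the first DocID is stored as an absolute uvarint and every later DocID as a uvarint delta to its predecessor, with the zero-filled remainder of the page acting as the terminator. The following standalone sketch shows the same round-trip using only encoding/binary; it is illustrative and not part of this commit.

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeDeltas packs ascending IDs as: first value absolute, rest as deltas.
func encodeDeltas(ids []uint64) []byte {
	buf := make([]byte, 0, len(ids)*binary.MaxVarintLen64)
	var tmp [binary.MaxVarintLen64]byte
	var prev uint64
	for i, id := range ids {
		v := id
		if i > 0 {
			v = id - prev
		}
		n := binary.PutUvarint(tmp[:], v)
		buf = append(buf, tmp[:n]...)
		prev = id
	}
	return buf
}

// decodeDeltas reverses encodeDeltas by accumulating the deltas.
func decodeDeltas(b []byte) []uint64 {
	var out []uint64
	var cur uint64
	for pos := 0; pos < len(b); {
		dv, n := binary.Uvarint(b[pos:])
		if n <= 0 {
			break
		}
		if len(out) == 0 {
			cur = dv
		} else {
			cur += dv
		}
		out = append(out, cur)
		pos += n
	}
	return out
}

func main() {
	ids := []uint64{5, 7, 8, 9, 1000}
	fmt.Println(decodeDeltas(encodeDeltas(ids))) // [5 7 8 9 1000]
}

Small deltas between nearby DocIDs encode in one or two bytes each, which is what lets a single 2048-byte page hold many postings.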
|
116 index/page_test.go Normal file
@@ -0,0 +1,116 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPageDelta(t *testing.T) {
|
||||
var (
|
||||
vals []DocID
|
||||
last DocID
|
||||
)
|
||||
for i := 0; i < 10000; i++ {
|
||||
vals = append(vals, last)
|
||||
last += DocID(rand.Int63n(1<<9) + 1)
|
||||
}
|
||||
data := make([]byte, pageSize)
|
||||
page := newPageDelta(data)
|
||||
|
||||
if err := page.init(vals[0]); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var num int
|
||||
pc := page.cursor()
|
||||
|
||||
for _, v := range vals[1:] {
|
||||
if err := pc.append(v); err != nil {
|
||||
if err == errPageFull {
|
||||
break
|
||||
}
|
||||
t.Fatal(err)
|
||||
}
|
||||
num++
|
||||
}
|
||||
|
||||
res, err := ExpandIterator(pc)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !reflect.DeepEqual(res, vals[:num+1]) {
|
||||
t.Errorf("output did not match")
|
||||
t.Errorf("expected: %v", vals[:num+1])
|
||||
t.Errorf("received: %v", res)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPageDeltaAppend(b *testing.B) {
|
||||
var (
|
||||
vals []DocID
|
||||
last DocID
|
||||
)
|
||||
for i := 0; i < 10000; i++ {
|
||||
vals = append(vals, last)
|
||||
last += DocID(rand.Int63n(1<<10) + 1)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
data := make([]byte, pageSize)
|
||||
page := newPageDelta(data)
|
||||
|
||||
if err := page.init(vals[0]); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
pc := page.cursor()
|
||||
|
||||
for _, v := range vals[1:] {
|
||||
if err := pc.append(v); err != nil {
|
||||
if err == errPageFull {
|
||||
break
|
||||
}
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPageDeltaRead(b *testing.B) {
|
||||
var (
|
||||
vals []DocID
|
||||
last DocID
|
||||
)
|
||||
for i := 0; i < 10000; i++ {
|
||||
vals = append(vals, last)
|
||||
last += DocID(rand.Int63n(1<<10) + 1)
|
||||
}
|
||||
data := make([]byte, pageSize)
|
||||
page := newPageDelta(data)
|
||||
|
||||
if err := page.init(vals[0]); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
pc := page.cursor()
|
||||
|
||||
for _, v := range vals[1:] {
|
||||
if err := pc.append(v); err != nil {
|
||||
if err == errPageFull {
|
||||
break
|
||||
}
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
if _, err := ExpandIterator(pc); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
72 index/postings.go Normal file
@@ -0,0 +1,72 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"github.com/boltdb/bolt"
|
||||
)
|
||||
|
||||
type iteratorStoreFunc func(k uint64) (Iterator, error)
|
||||
|
||||
func (s iteratorStoreFunc) get(k uint64) (Iterator, error) {
|
||||
return s(k)
|
||||
}
|
||||
|
||||
// boltSkiplistCursor implements the skiplistCursor interface.
|
||||
//
|
||||
// TODO(fabxc): benchmark the overhead of a bucket per key.
|
||||
// It might be more performant to have all skiplists in the same bucket.
|
||||
//
|
||||
// 20k keys, ~10 skiplist entries avg -> 200k keys, 1 bucket vs 20k buckets, 10 keys
|
||||
//
|
||||
type boltSkiplistCursor struct {
|
||||
// k is currently unused. If the bucket holds entries for more than
|
||||
// just a single key, it will be necessary.
|
||||
k uint64
|
||||
c *bolt.Cursor
|
||||
bkt *bolt.Bucket
|
||||
}
|
||||
|
||||
func (s *boltSkiplistCursor) next() (DocID, uint64, error) {
|
||||
db, pb := s.c.Next()
|
||||
if db == nil {
|
||||
return 0, 0, io.EOF
|
||||
}
|
||||
return newDocID(db), decodeUint64(pb), nil
|
||||
}
|
||||
|
||||
func (s *boltSkiplistCursor) seek(k DocID) (DocID, uint64, error) {
|
||||
db, pb := s.c.Seek(k.bytes())
|
||||
if db == nil {
|
||||
db, pb = s.c.Last()
|
||||
if db == nil {
|
||||
return 0, 0, io.EOF
|
||||
}
|
||||
}
|
||||
did, pid := newDocID(db), decodeUint64(pb)
|
||||
|
||||
if did > k {
|
||||
// If the found entry is behind the seeked ID, try the previous
|
||||
// entry if it exists. The page it points to contains the range of k.
|
||||
dbp, pbp := s.c.Prev()
|
||||
if dbp != nil {
|
||||
did, pid = newDocID(dbp), decodeUint64(pbp)
|
||||
} else {
|
||||
// We skipped before the first entry. The cursor is now out of
|
||||
// state and subsequent calls to Next() will return nothing.
|
||||
// Reset it to the first position.
|
||||
s.c.First()
|
||||
}
|
||||
}
|
||||
return did, pid, nil
|
||||
}
|
||||
|
||||
func (s *boltSkiplistCursor) append(d DocID, p uint64) error {
|
||||
k, _ := s.c.Last()
|
||||
|
||||
if k != nil && decodeUint64(k) >= uint64(d) {
|
||||
return errOutOfOrder
|
||||
}
|
||||
|
||||
return s.bkt.Put(encodeUint64(uint64(d)), encodeUint64(p))
|
||||
}
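A rough sketch of how such a cursor could be wired up against a Bolt bucket inside a write transaction. The bucket naming scheme is an assumption for illustration, and the seek result noted in the comment assumes DocID keys are encoded big-endian so that Bolt's byte order matches numeric order; none of this is part of the commit itself.

func exampleSkiplist(bdb *bolt.DB) error {
	return bdb.Update(func(tx *bolt.Tx) error {
		// One bucket per term/key, as discussed in the TODO above.
		bkt, err := tx.CreateBucketIfNotExists([]byte("skiplist/42"))
		if err != nil {
			return err
		}
		cur := &boltSkiplistCursor{k: 42, c: bkt.Cursor(), bkt: bkt}

		// Append (DocID, page id) entries in increasing DocID order.
		if err := cur.append(5, 3); err != nil {
			return err
		}
		if err := cur.append(50, 2); err != nil {
			return err
		}

		// Seeking to 20 lands on the entry covering it: DocID 5 -> page 3
		// (assuming big-endian DocID keys).
		_, _, err = cur.seek(20)
		return err
	})
}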
|
1 pages
@@ -1 +0,0 @@
|
|||
Subproject commit 76be68c231de3265e6d0ee670fe312f80b86b99a
|
5 pages/README.md Normal file
@@ -0,0 +1,5 @@
|
|||
# pages
|
||||
|
||||
Pages stores pages of blob data. It is essentially a minimal version of
|
||||
BoltDB, where the B+ tree was removed and replaced by simply writing
|
||||
page-aligned byte slices.
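A minimal sketch of the intended calling pattern from a client's point of view, assuming only the Open, Update, Close and Tx.ID entry points added in this commit; the import path and file location are placeholders.

package main

import (
	"log"

	"github.com/prometheus/prometheus/pages" // placeholder import path, an assumption
)

func main() {
	db, err := pages.Open("/tmp/pages.db", 0666, nil) // nil -> DefaultOptions
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// All writes go through a managed read/write transaction; returning nil
	// from the closure commits it, returning an error rolls it back.
	if err := db.Update(func(tx *pages.Tx) error {
		log.Println("write tx:", tx.ID())
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}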
|
782 pages/db.go Normal file
@@ -0,0 +1,782 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"math"
|
||||
"os"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// These errors can be returned when opening or calling methods on a DB.
|
||||
var (
|
||||
// ErrDatabaseNotOpen is returned when a DB instance is accessed before it
|
||||
// is opened or after it is closed.
|
||||
ErrDatabaseNotOpen = errors.New("database not open")
|
||||
|
||||
// ErrDatabaseOpen is returned when opening a database that is
|
||||
// already open.
|
||||
ErrDatabaseOpen = errors.New("database already open")
|
||||
|
||||
// ErrInvalid is returned when both meta pages on a database are invalid.
|
||||
// This typically occurs when a file is not a pagebuf database.
|
||||
ErrInvalid = errors.New("invalid database")
|
||||
|
||||
// ErrVersionMismatch is returned when the data file was created with a
|
||||
// different version of pagebuf.
|
||||
ErrVersionMismatch = errors.New("version mismatch")
|
||||
|
||||
// ErrChecksum is returned when either meta page checksum does not match.
|
||||
ErrChecksum = errors.New("checksum error")
|
||||
|
||||
// ErrTimeout is returned when a database cannot obtain an exclusive lock
|
||||
// on the data file after the timeout passed to Open().
|
||||
ErrTimeout = errors.New("timeout")
|
||||
|
||||
// ErrNotFound is returned when a user page for an ID could not be found.
|
||||
ErrNotFound = errors.New("not found")
|
||||
|
||||
ErrTxClosed = errors.New("transaction closed")
|
||||
|
||||
ErrTxNotWritable = errors.New("transaction not writable")
|
||||
)
|
||||
|
||||
// Marker value that indicates that a file is a pagebuf file.
|
||||
const magic uint32 = 0xAFFEAFFE
|
||||
|
||||
// The data file version.
|
||||
const version = 1
|
||||
|
||||
// The largest step that can be taken when remapping the mmap.
|
||||
const maxMmapStep = 1 << 30 // 1GB
|
||||
|
||||
// defaultPageSize of the underlying buffers is set to the OS page size.
|
||||
var defaultPageSize = os.Getpagesize()
|
||||
|
||||
// DB provides access to persistent byte chunks that
|
||||
// are backed by memory-mapped pages.
|
||||
type DB struct {
|
||||
// If you want to read the entire database fast, you can set MmapFlags to
|
||||
// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
|
||||
MmapFlags int
|
||||
|
||||
// AllocSize is the amount of space allocated when the database
|
||||
// needs to create new pages. This is done to amortize the cost
|
||||
// of truncate() and fsync() when growing the data file.
|
||||
AllocSize int
|
||||
|
||||
path string // location of the pagebuf file
|
||||
file *os.File // the opened file of path
|
||||
opened bool
|
||||
data *[maxMapSize]byte
|
||||
dataref []byte // mmap'ed readonly, write throws SEGV
|
||||
datasz int
|
||||
filesz int // current on disk file size
|
||||
pageSize int
|
||||
meta0 *meta
|
||||
meta1 *meta
|
||||
freelist *freelist
|
||||
rwtx *Tx
|
||||
txs []*Tx
|
||||
|
||||
pagePool sync.Pool
|
||||
|
||||
rwlock sync.Mutex // Allows only one writer at a time.
|
||||
metalock sync.Mutex // Protects meta page access.
|
||||
mmaplock sync.RWMutex // Protects mmap access during remapping
|
||||
|
||||
ops struct {
|
||||
writeAt func(b []byte, off int64) (n int, err error)
|
||||
}
|
||||
}
|
||||
|
||||
// Options defines configuration parameters with which a PageBuf is initialized.
|
||||
type Options struct {
|
||||
// Timeout is the amount of time to wait to obtain a file lock.
|
||||
// When set to zero it will wait indefinitely. This option is only
|
||||
// available on Darwin and Linux.
|
||||
Timeout time.Duration
|
||||
|
||||
// Sets the DB.MmapFlags flag before memory mapping the file.
|
||||
MmapFlags int
|
||||
|
||||
// XXX(fabxc): potentially allow setting different allocation strategies
|
||||
// to fit different use cases.
|
||||
|
||||
// InitialMmapSize is the initial mmap size of the database
|
||||
// in bytes.
|
||||
//
|
||||
// If <=0, the initial map size is 0.
|
||||
// If InitialMmapSize is smaller than the previous database size,
// it has no effect.
|
||||
InitialMmapSize int
|
||||
|
||||
// PageSize defines a custom page size used. It cannot be changed later.
|
||||
// Must be a multiple of the operating system's default page size.
|
||||
PageSize int
|
||||
}
|
||||
|
||||
// DefaultOptions specifies a set of default parameters used when a pagebuf
|
||||
// is opened without explicit options.
|
||||
var DefaultOptions = Options{
|
||||
// Use the OS's default page size.
|
||||
PageSize: defaultPageSize,
|
||||
}
|
||||
|
||||
// Default values if not set in a DB instance.
|
||||
const (
|
||||
DefaultAllocSize = 16 * 1024 * 1024
|
||||
)
|
||||
|
||||
// Open and create a new database under the given path.
|
||||
func Open(path string, mode os.FileMode, o *Options) (*DB, error) {
|
||||
db := &DB{
|
||||
opened: true,
|
||||
}
|
||||
|
||||
// Set default options if no options are provided.
|
||||
if o == nil {
|
||||
o = &DefaultOptions
|
||||
}
|
||||
db.MmapFlags = o.MmapFlags
|
||||
|
||||
db.AllocSize = DefaultAllocSize
|
||||
|
||||
flag := os.O_RDWR
|
||||
|
||||
// Open data file and separate sync handler for metadata writes.
|
||||
db.path = path
|
||||
var err error
|
||||
if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
|
||||
_ = db.close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Lock file so that other processes using pagebuf in read-write mode cannot
|
||||
// use the underlying data at the same time.
|
||||
if err := flock(db, mode, true, o.Timeout); err != nil {
|
||||
_ = db.close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Default values for test hooks
|
||||
db.ops.writeAt = db.file.WriteAt
|
||||
|
||||
// Initialize the database if it doesn't exist.
|
||||
if info, err := db.file.Stat(); err != nil {
|
||||
return nil, err
|
||||
} else if info.Size() == 0 {
|
||||
// Initialize new files with meta pages.
|
||||
if err := db.init(o.PageSize); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
// Read the first meta page to determine the page size.
|
||||
var buf [0x1000]byte
|
||||
if _, err := db.file.ReadAt(buf[:], 0); err == nil {
|
||||
m := db.pageInBuffer(buf[:], 0).meta()
|
||||
if err := m.validate(); err != nil {
|
||||
// We cannot verify which page sizes are used.
|
||||
return nil, fmt.Errorf("cannot read page size: %s", err)
|
||||
} else {
|
||||
db.pageSize = int(m.pageSize)
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("reading first meta page failed: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize page pool.
|
||||
db.pagePool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
return make([]byte, db.pageSize)
|
||||
},
|
||||
}
|
||||
|
||||
// Memory map the data file.
|
||||
if err := db.mmap(o.InitialMmapSize); err != nil {
|
||||
_ = db.close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Read in the freelist.
|
||||
db.freelist = newFreelist()
|
||||
db.freelist.read(db.page(db.meta().freelist))
|
||||
|
||||
// Mark the database as opened and return.
|
||||
return db, nil
|
||||
}
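For completeness, a hedged sketch of opening with explicit Options instead of relying on DefaultOptions; the concrete values are arbitrary illustrations, not recommendations, and the helper name is made up for this example.

func openWithOptions(path string) (*DB, error) {
	opts := &Options{
		Timeout:         time.Second,          // fail instead of waiting forever for the file lock
		InitialMmapSize: 1 << 20,              // pre-map 1MB so long readers block the writer less
		PageSize:        2 * os.Getpagesize(), // must stay a multiple of the OS page size
	}
	return Open(path, 0666, opts)
}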
|
||||
|
||||
func validatePageSize(psz int) error {
|
||||
// Max value the content length can hold.
|
||||
if defaultPageSize > math.MaxUint16 {
|
||||
return fmt.Errorf("invalid page size %d", psz)
|
||||
}
|
||||
// Page size must be a multiple of OS page size so we stay
|
||||
// page aligned.
|
||||
if psz < defaultPageSize {
|
||||
if defaultPageSize%psz != 0 {
|
||||
return fmt.Errorf("invalid page size %d", psz)
|
||||
}
|
||||
} else if psz > defaultPageSize {
|
||||
if psz%defaultPageSize != 0 {
|
||||
return fmt.Errorf("invalid page size %d", psz)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
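Concretely, assuming a 4096-byte OS page, the rule above works out as follows (illustrative only):

// validatePageSize(1024) -> nil    (4096 % 1024 == 0, stays page aligned)
// validatePageSize(4096) -> nil    (equal to the OS page size)
// validatePageSize(8192) -> nil    (8192 % 4096 == 0)
// validatePageSize(3000) -> error  (4096 % 3000 != 0)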
|
||||
|
||||
// init creates a new database file and initializes its meta pages.
|
||||
func (db *DB) init(psz int) error {
|
||||
if err := validatePageSize(psz); err != nil {
|
||||
return err
|
||||
}
|
||||
// Set the page size to the OS page size.
|
||||
db.pageSize = psz
|
||||
|
||||
// Create two meta pages on a buffer.
|
||||
buf := make([]byte, db.pageSize*4)
|
||||
for i := 0; i < 2; i++ {
|
||||
p := db.pageInBuffer(buf[:], pgid(i))
|
||||
p.id = pgid(i)
|
||||
p.flags = pageFlagMeta
|
||||
|
||||
// Initialize the meta page.
|
||||
m := p.meta()
|
||||
m.magic = magic
|
||||
m.version = version
|
||||
m.pageSize = uint32(db.pageSize)
|
||||
m.freelist = 2
|
||||
m.txid = txid(i)
|
||||
m.pgid = 4 // TODO(fabxc): we initialize with zero pages, what to do here?
|
||||
m.checksum = m.sum64()
|
||||
}
|
||||
|
||||
// Write an empty freelist at page 3.
|
||||
p := db.pageInBuffer(buf[:], pgid(2))
|
||||
p.id = pgid(2)
|
||||
p.flags = pageFlagFreelist
|
||||
p.count = 0
|
||||
|
||||
// Write the first empty page.
|
||||
p = db.pageInBuffer(buf[:], pgid(3))
|
||||
p.id = pgid(3)
|
||||
p.flags = pageFlagData
|
||||
p.count = 0
|
||||
|
||||
// Write the buffer to our data file.
|
||||
if _, err := db.ops.writeAt(buf, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fdatasync(db); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Sync executes fdatasync() against the database file handle.
|
||||
func (db *DB) Sync() error { return fdatasync(db) }
|
||||
|
||||
// Close synchronizes and closes the memory-mapped pagebuf file.
|
||||
func (db *DB) Close() error {
|
||||
db.rwlock.Lock()
|
||||
defer db.rwlock.Unlock()
|
||||
|
||||
db.metalock.Lock()
|
||||
defer db.metalock.Unlock()
|
||||
|
||||
db.mmaplock.RLock()
|
||||
defer db.mmaplock.RUnlock()
|
||||
|
||||
return db.close()
|
||||
}
|
||||
|
||||
func (db *DB) close() error {
|
||||
if !db.opened {
|
||||
return nil
|
||||
}
|
||||
|
||||
db.opened = false
|
||||
db.freelist = nil
|
||||
db.ops.writeAt = nil
|
||||
|
||||
// Close the mmap.
|
||||
if err := db.munmap(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Close file handles.
|
||||
if db.file != nil {
|
||||
// Close the file descriptor.
|
||||
if err := db.file.Close(); err != nil {
|
||||
return fmt.Errorf("db file close: %s", err)
|
||||
}
|
||||
db.file = nil
|
||||
}
|
||||
|
||||
db.path = ""
|
||||
return nil
|
||||
}
|
||||
|
||||
// Update executes a function within the context of a read-write managed transaction.
|
||||
// If no error is returned from the function then the transaction is committed.
|
||||
// If an error is returned then the entire transaction is rolled back.
|
||||
// Any error that is returned from the function or returned from the commit is
|
||||
// returned from the Update() method.
|
||||
//
|
||||
// Attempting to manually commit or rollback within the function will cause a panic.
|
||||
func (db *DB) Update(fn func(*Tx) error) error {
|
||||
t, err := db.Begin(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Make sure the transaction rolls back in the event of a panic.
|
||||
defer func() {
|
||||
if t.db != nil {
|
||||
t.rollback()
|
||||
}
|
||||
}()
|
||||
|
||||
// Mark as a managed tx so that the inner function cannot manually commit.
|
||||
t.managed = true
|
||||
|
||||
// If an error is returned from the function then rollback and return error.
|
||||
err = fn(t)
|
||||
t.managed = false
|
||||
if err != nil {
|
||||
_ = t.Rollback()
|
||||
return err
|
||||
}
|
||||
|
||||
return t.Commit()
|
||||
}
|
||||
|
||||
// View executes a function within the context of a managed read-only transaction.
|
||||
// Any error that is returned from the function is returned from the View() method.
|
||||
//
|
||||
// Attempting to manually rollback within the function will cause a panic.
|
||||
func (db *DB) View(fn func(*Tx) error) error {
|
||||
t, err := db.Begin(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Make sure the transaction rolls back in the event of a panic.
|
||||
defer func() {
|
||||
if t.db != nil {
|
||||
t.rollback()
|
||||
}
|
||||
}()
|
||||
|
||||
// Mark as a managed tx so that the inner function cannot manually rollback.
|
||||
t.managed = true
|
||||
|
||||
// If an error is returned from the function then pass it through.
|
||||
err = fn(t)
|
||||
t.managed = false
|
||||
if err != nil {
|
||||
_ = t.Rollback()
|
||||
return err
|
||||
}
|
||||
|
||||
if err := t.Rollback(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
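A short sketch of the read-only counterpart from the caller's side; as noted above, the closure must not commit or roll back on its own. The helper name is illustrative only.

func readTxID(db *DB) (uint64, error) {
	var id uint64
	// Any error returned by the closure is passed straight through by View;
	// the rollback of the read-only transaction is handled for us.
	err := db.View(func(tx *Tx) error {
		id = tx.ID()
		return nil
	})
	return id, err
}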
|
||||
|
||||
// pageExists checks whether the page with the given id exists.
|
||||
func (db *DB) pageExists(id pgid) bool {
|
||||
// The page exists if it is not in the freelist or out of the data range.
|
||||
return !db.freelist.cache[pgid(id)] && int(id+1)*db.pageSize < db.datasz
|
||||
}
|
||||
|
||||
// page retrieves a page reference from the mmap based on the current page size.
|
||||
func (db *DB) page(id pgid) *page {
|
||||
pos := id * pgid(db.pageSize)
|
||||
return (*page)(unsafe.Pointer(&db.data[pos]))
|
||||
}
|
||||
|
||||
// pageInBuffer retrieves a page reference from a given byte array based on the current
|
||||
// page size.
|
||||
func (db *DB) pageInBuffer(b []byte, id pgid) *page {
|
||||
pos := id * pgid(db.pageSize)
|
||||
return (*page)(unsafe.Pointer(&b[pos]))
|
||||
}
|
||||
|
||||
// meta retrieves the current meta page reference.
|
||||
func (db *DB) meta() *meta {
|
||||
// We have to return the meta with the highest txid which doesn't fail
|
||||
// validation. Otherwise, we can cause errors when in fact the database is
|
||||
// in a consistent state. metaA is the one with the higher txid.
|
||||
metaA := db.meta0
|
||||
metaB := db.meta1
|
||||
if db.meta1.txid > db.meta0.txid {
|
||||
metaA = db.meta1
|
||||
metaB = db.meta0
|
||||
}
|
||||
|
||||
// Use higher meta page if valid. Otherwise fallback to previous, if valid.
|
||||
if err := metaA.validate(); err == nil {
|
||||
return metaA
|
||||
} else if err := metaB.validate(); err == nil {
|
||||
return metaB
|
||||
}
|
||||
|
||||
// This should never be reached, because both meta1 and meta0 were validated
|
||||
// on mmap() and we do fsync() on every write.
|
||||
panic("pagebuf.PageBuf.meta(): invalid meta pages")
|
||||
}
|
||||
|
||||
// allocate returns a contiguous block of memory starting at a given page.
|
||||
func (db *DB) allocate(count int) (*page, error) {
|
||||
// Allocate a temporary buffer for the page.
|
||||
var buf []byte
|
||||
if count == 1 {
|
||||
buf = db.pagePool.Get().([]byte)
|
||||
} else {
|
||||
buf = make([]byte, count*db.pageSize)
|
||||
}
|
||||
p := (*page)(unsafe.Pointer(&buf[0]))
|
||||
p.overflow = uint32(count - 1)
|
||||
|
||||
// Use pages from the freelist if they are available.
|
||||
if p.id = db.freelist.allocate(count); p.id != 0 {
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// Resize mmap() if we're at the end.
|
||||
p.id = db.rwtx.meta.pgid
|
||||
var minsz = int((p.id+pgid(count))+1) * db.pageSize
|
||||
if minsz >= db.datasz {
|
||||
if err := db.mmap(minsz); err != nil {
|
||||
return nil, fmt.Errorf("mmap allocate error: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Move the page id high water mark.
|
||||
db.rwtx.meta.pgid += pgid(count)
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// grow grows the size of the database to the given sz.
|
||||
func (db *DB) grow(sz int) error {
|
||||
// Ignore if the new size is less than available file size.
|
||||
if sz <= db.filesz {
|
||||
return nil
|
||||
}
|
||||
|
||||
// If the data is smaller than the alloc size then only allocate what's needed.
|
||||
// Once it goes over the allocation size then allocate in chunks.
|
||||
if db.datasz < db.AllocSize {
|
||||
sz = db.datasz
|
||||
} else {
|
||||
sz += db.AllocSize
|
||||
}
|
||||
|
||||
// Truncate and fsync to ensure file size metadata is flushed.
|
||||
// https://github.com/boltdb/bolt/issues/284
|
||||
if runtime.GOOS != "windows" {
|
||||
if err := db.file.Truncate(int64(sz)); err != nil {
|
||||
return fmt.Errorf("file resize error: %s", err)
|
||||
}
|
||||
}
|
||||
if err := db.file.Sync(); err != nil {
|
||||
return fmt.Errorf("file sync error: %s", err)
|
||||
}
|
||||
|
||||
db.filesz = sz
|
||||
return nil
|
||||
}
|
||||
|
||||
// mmap opens the underlying memory-mapped file and initializes it.
|
||||
// minsz is the minimum size that the mmap can be.
|
||||
func (db *DB) mmap(minsz int) error {
|
||||
db.mmaplock.Lock()
|
||||
defer db.mmaplock.Unlock()
|
||||
|
||||
info, err := db.file.Stat()
|
||||
if err != nil {
|
||||
return fmt.Errorf("mmap stat error: %s", err)
|
||||
} else if int(info.Size()) < db.pageSize*2 {
|
||||
return fmt.Errorf("file size too small")
|
||||
}
|
||||
|
||||
// Ensure the size is at least the minimum size.
|
||||
var size = int(info.Size())
|
||||
if size < minsz {
|
||||
size = minsz
|
||||
}
|
||||
size, err = db.mmapSize(size)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Unmap existing data before continuing.
|
||||
if err := db.munmap(); err != nil {
|
||||
return err
|
||||
}
|
||||
// Memory-map the data file as a byte slice.
|
||||
if err := mmap(db, size); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Save references to the meta pages.
|
||||
db.meta0 = db.page(0).meta()
|
||||
db.meta1 = db.page(1).meta()
|
||||
|
||||
// Validate the meta pages. We only return an error if both meta pages fail
|
||||
// validation, since meta0 failing validation means that it wasn't saved
|
||||
// properly -- but we can recover using meta1. And vice-versa.
|
||||
err0 := db.meta0.validate()
|
||||
err1 := db.meta1.validate()
|
||||
if err0 != nil && err1 != nil {
|
||||
return err0
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// munmap unmaps the data file from memory.
|
||||
func (db *DB) munmap() error {
|
||||
if err := munmap(db); err != nil {
|
||||
return fmt.Errorf("unmap error: %s", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mmapSize determines the appropriate size for the mmap given the current size
|
||||
// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
|
||||
// Returns an error if the new mmap size is greater than the max allowed.
|
||||
func (db *DB) mmapSize(size int) (int, error) {
|
||||
// Double the size from 32KB until 1GB.
|
||||
for i := uint(15); i <= 30; i++ {
|
||||
if size <= 1<<i {
|
||||
return 1 << i, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Verify the requested size is not above the maximum allowed.
|
||||
if size > maxMapSize {
|
||||
return 0, fmt.Errorf("mmap too large")
|
||||
}
|
||||
|
||||
// If larger than 1GB then grow by 1GB at a time.
|
||||
sz := int64(size)
|
||||
if remainder := sz % int64(maxMmapStep); remainder > 0 {
|
||||
sz += int64(maxMmapStep) - remainder
|
||||
}
|
||||
|
||||
// Ensure that the mmap size is a multiple of the page size.
|
||||
// This should always be true since we're incrementing in MBs.
|
||||
pageSize := int64(db.pageSize)
|
||||
if (sz % pageSize) != 0 {
|
||||
sz = ((sz / pageSize) + 1) * pageSize
|
||||
}
|
||||
|
||||
// If we've exceeded the max size then only grow up to the max size.
|
||||
if sz > maxMapSize {
|
||||
sz = maxMapSize
|
||||
}
|
||||
|
||||
return int(sz), nil
|
||||
}
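A few worked values for the growth rule above, assuming a 4096-byte page size (illustrative only):

// mmapSize(10_000)    -> 32768      (smallest step of the 32KB..1GB doubling ladder that fits)
// mmapSize(40_000)    -> 65536
// mmapSize(1<<30)     -> 1 << 30    (exactly 1GB still fits the doubling ladder)
// mmapSize(1<<30 + 1) -> 2 << 30    (beyond 1GB it grows in 1GB steps)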
|
||||
|
||||
func (db *DB) String() string {
|
||||
return fmt.Sprintf("PageBuf<%s>", db.path)
|
||||
}
|
||||
|
||||
// Path returns the path to the currently opened pagebuf file.
|
||||
func (db *DB) Path() string {
|
||||
return db.path
|
||||
}
|
||||
|
||||
// Begin starts a new transaction.
|
||||
// Multiple read-only transactions can be used concurrently but only one
|
||||
// write transaction can be used at a time. Starting multiple write transactions
|
||||
// will cause the calls to block and be serialized until the current write
|
||||
// transaction finishes.
|
||||
//
|
||||
// Transactions should not be dependent on one another. Opening a read
|
||||
// transaction and a write transaction in the same goroutine can cause the
|
||||
// writer to deadlock because the database periodically needs to re-mmap itself
|
||||
// as it grows and it cannot do that while a read transaction is open.
|
||||
//
|
||||
// If a long running read transaction (for example, a snapshot transaction) is
|
||||
// needed, you might want to set Options.InitialMmapSize to a large enough value
// to avoid potential blocking of the write transaction.
|
||||
//
|
||||
// IMPORTANT: You must close read-only transactions after you are finished or
|
||||
// else the database will not reclaim old pages.
|
||||
func (db *DB) Begin(writable bool) (*Tx, error) {
|
||||
if writable {
|
||||
return db.beginRWTx()
|
||||
}
|
||||
return db.beginTx()
|
||||
}
|
||||
|
||||
func (db *DB) beginTx() (*Tx, error) {
|
||||
// Lock the meta pages while we initialize the transaction. We obtain
|
||||
// the meta lock before the mmap lock because that's the order that the
|
||||
// write transaction will obtain them.
|
||||
db.metalock.Lock()
|
||||
|
||||
// Obtain a read-only lock on the mmap. When the mmap is remapped it will
|
||||
// obtain a write lock so all transactions must finish before it can be
|
||||
// remapped.
|
||||
db.mmaplock.RLock()
|
||||
|
||||
// Exit if the database is not open yet.
|
||||
if !db.opened {
|
||||
db.mmaplock.RUnlock()
|
||||
db.metalock.Unlock()
|
||||
return nil, ErrDatabaseNotOpen
|
||||
}
|
||||
|
||||
// Create a transaction associated with the database.
|
||||
t := &Tx{}
|
||||
t.init(db)
|
||||
|
||||
// Keep track of transaction until it closes.
|
||||
db.txs = append(db.txs, t)
|
||||
|
||||
// Unlock the meta pages.
|
||||
db.metalock.Unlock()
|
||||
|
||||
return t, nil
|
||||
}
|
||||
|
||||
func (db *DB) beginRWTx() (*Tx, error) {
|
||||
// Obtain writer lock. This is released by the transaction when it closes.
|
||||
// This enforces only one writer transaction at a time.
|
||||
db.rwlock.Lock()
|
||||
|
||||
// Once we have the writer lock then we can lock the meta pages so that
|
||||
// we can set up the transaction.
|
||||
db.metalock.Lock()
|
||||
defer db.metalock.Unlock()
|
||||
|
||||
// Exit if the database is not open yet.
|
||||
if !db.opened {
|
||||
db.rwlock.Unlock()
|
||||
return nil, ErrDatabaseNotOpen
|
||||
}
|
||||
|
||||
// Create a transaction associated with the database.
|
||||
t := &Tx{writable: true}
|
||||
t.init(db)
|
||||
db.rwtx = t
|
||||
|
||||
// Free any pages associated with closed read-only transactions.
|
||||
var minid txid = 0xFFFFFFFFFFFFFFFF
|
||||
for _, t := range db.txs {
|
||||
if t.meta.txid < minid {
|
||||
minid = t.meta.txid
|
||||
}
|
||||
}
|
||||
if minid > 0 {
|
||||
db.freelist.release(minid - 1)
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
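A brief worked example of the freelist release performed when a write transaction starts (numbers are illustrative):

// With read-only transactions still open at txids 9 and 11, minid becomes 9
// and release(8) moves every page pending from txids <= 8 onto the free list.
// With no open readers, minid keeps its sentinel value and effectively all
// pending pages are released.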
|
||||
|
||||
// removeTx removes a transaction from the database.
|
||||
func (db *DB) removeTx(tx *Tx) {
|
||||
// Release the read lock on the mmap.
|
||||
db.mmaplock.RUnlock()
|
||||
|
||||
// Use the meta lock to restrict access to the DB object.
|
||||
db.metalock.Lock()
|
||||
|
||||
// Remove the transaction.
|
||||
for i, t := range db.txs {
|
||||
if t == tx {
|
||||
db.txs = append(db.txs[:i], db.txs[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
// Unlock the meta pages.
|
||||
db.metalock.Unlock()
|
||||
}
|
||||
|
||||
// Size represents a valid page size.
|
||||
type Size int8
|
||||
|
||||
// The valid sizes for allocated pages.
|
||||
const (
|
||||
Size512 Size = -3
|
||||
Size1024 = -2
|
||||
Size2048 = -1
|
||||
Size4096 = 0
|
||||
Size8192 = 1
|
||||
)
|
||||
|
||||
const (
|
||||
upageSizeMin = Size512
|
||||
upageSizeMax = Size8192
|
||||
)
|
||||
|
||||
type meta struct {
|
||||
magic uint32
|
||||
version uint32
|
||||
pageSize uint32
|
||||
flags uint32
|
||||
freelist pgid
|
||||
txid txid
|
||||
pgid pgid
|
||||
checksum uint64
|
||||
}
|
||||
|
||||
// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
|
||||
func (m *meta) validate() error {
|
||||
if m.magic != magic {
|
||||
return ErrInvalid
|
||||
} else if m.version != version {
|
||||
return ErrVersionMismatch
|
||||
} else if m.checksum != 0 && m.checksum != m.sum64() {
|
||||
return ErrChecksum
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// copy copies one meta object to another.
|
||||
func (m *meta) copy(dest *meta) {
|
||||
*dest = *m
|
||||
}
|
||||
|
||||
// write writes the meta onto a page.
|
||||
func (m *meta) write(p *page) {
|
||||
if m.freelist >= m.pgid {
|
||||
panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
|
||||
}
|
||||
|
||||
// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
|
||||
p.id = pgid(m.txid % 2)
|
||||
p.flags |= pageFlagMeta
|
||||
|
||||
// Calculate the checksum.
|
||||
m.checksum = m.sum64()
|
||||
|
||||
m.copy(p.meta())
|
||||
}
|
||||
|
||||
// generates the checksum for the meta.
|
||||
func (m *meta) sum64() uint64 {
|
||||
var h = fnv.New64a()
|
||||
_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
|
||||
return h.Sum64()
|
||||
}
|
||||
|
||||
// _assert will panic with a given formatted message if the given condition is false.
|
||||
func _assert(condition bool, msg string, v ...interface{}) {
|
||||
if !condition {
|
||||
panic(fmt.Sprintf("assertion failed: "+msg, v...))
|
||||
}
|
||||
}
|
248 pages/freelist.go Normal file
@@ -0,0 +1,248 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// freelist represents a list of all pages that are available for allocation.
|
||||
// It also tracks pages that have been freed but are still in use by open transactions.
|
||||
type freelist struct {
|
||||
ids []pgid // all free and available free page ids.
|
||||
pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
|
||||
cache map[pgid]bool // fast lookup of all free and pending page ids.
|
||||
}
|
||||
|
||||
// newFreelist returns an empty, initialized freelist.
|
||||
func newFreelist() *freelist {
|
||||
return &freelist{
|
||||
pending: make(map[txid][]pgid),
|
||||
cache: make(map[pgid]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// size returns the size of the page after serialization.
|
||||
func (f *freelist) size() int {
|
||||
return PageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count())
|
||||
}
|
||||
|
||||
// count returns count of pages on the freelist
|
||||
func (f *freelist) count() int {
|
||||
return f.free_count() + f.pending_count()
|
||||
}
|
||||
|
||||
// free_count returns count of free pages
|
||||
func (f *freelist) free_count() int {
|
||||
return len(f.ids)
|
||||
}
|
||||
|
||||
// pending_count returns count of pending pages
|
||||
func (f *freelist) pending_count() int {
|
||||
var count int
|
||||
for _, list := range f.pending {
|
||||
count += len(list)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// all returns a list of all free ids and all pending ids in one sorted list.
|
||||
func (f *freelist) all() []pgid {
|
||||
m := make(pgids, 0)
|
||||
|
||||
for _, list := range f.pending {
|
||||
m = append(m, list...)
|
||||
}
|
||||
|
||||
sort.Sort(m)
|
||||
return pgids(f.ids).merge(m)
|
||||
}
|
||||
|
||||
// allocate returns the starting page id of a contiguous list of pages of a given size.
|
||||
// If a contiguous block cannot be found then 0 is returned.
|
||||
func (f *freelist) allocate(n int) pgid {
|
||||
if len(f.ids) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
var initial, previd pgid
|
||||
for i, id := range f.ids {
|
||||
if id <= 1 {
|
||||
panic(fmt.Sprintf("invalid page allocation: %d", id))
|
||||
}
|
||||
|
||||
// Reset initial page if this is not contiguous.
|
||||
if previd == 0 || id-previd != 1 {
|
||||
initial = id
|
||||
}
|
||||
|
||||
// If we found a contiguous block then remove it and return it.
|
||||
if (id-initial)+1 == pgid(n) {
|
||||
// If we're allocating off the beginning then take the fast path
|
||||
// and just adjust the existing slice. This will use extra memory
|
||||
// temporarily but the append() in free() will realloc the slice
|
||||
// as is necessary.
|
||||
if (i + 1) == n {
|
||||
f.ids = f.ids[i+1:]
|
||||
} else {
|
||||
copy(f.ids[i-n+1:], f.ids[i+1:])
|
||||
f.ids = f.ids[:len(f.ids)-n]
|
||||
}
|
||||
|
||||
// Remove from the free cache.
|
||||
for i := pgid(0); i < pgid(n); i++ {
|
||||
delete(f.cache, initial+i)
|
||||
}
|
||||
|
||||
return initial
|
||||
}
|
||||
|
||||
previd = id
|
||||
}
|
||||
return 0
|
||||
}
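A worked run of the scan above, mirroring the assertions in TestFreelist_allocate further down:

// Starting from ids = {3, 4, 5, 6, 7, 9, 12, 13, 18}:
//   allocate(3) -> 3   (takes 3,4,5;  ids = {6, 7, 9, 12, 13, 18})
//   allocate(1) -> 6   (              ids = {7, 9, 12, 13, 18})
//   allocate(3) -> 0   (no run of three contiguous ids is left)
//   allocate(2) -> 12  (takes 12,13;  ids = {7, 9, 18})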
|
||||
|
||||
// free releases a page and its overflow for a given transaction id.
|
||||
// If the page is already free then a panic will occur.
|
||||
func (f *freelist) free(txid txid, p *page) {
|
||||
if p.id <= 1 {
|
||||
panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id))
|
||||
}
|
||||
|
||||
// Free page and all its overflow pages.
|
||||
var ids = f.pending[txid]
|
||||
for id := p.id; id <= p.id+pgid(p.overflow); id++ {
|
||||
// Verify that page is not already free.
|
||||
if f.cache[id] {
|
||||
panic(fmt.Sprintf("page %d already freed", id))
|
||||
}
|
||||
|
||||
// Add to the freelist and cache.
|
||||
ids = append(ids, id)
|
||||
f.cache[id] = true
|
||||
}
|
||||
f.pending[txid] = ids
|
||||
}
|
||||
|
||||
// release moves all page ids for a transaction id (or older) to the freelist.
|
||||
func (f *freelist) release(txid txid) {
|
||||
m := make(pgids, 0)
|
||||
for tid, ids := range f.pending {
|
||||
if tid <= txid {
|
||||
// Move transaction's pending pages to the available freelist.
|
||||
// Don't remove from the cache since the page is still free.
|
||||
m = append(m, ids...)
|
||||
delete(f.pending, tid)
|
||||
}
|
||||
}
|
||||
sort.Sort(m)
|
||||
f.ids = pgids(f.ids).merge(m)
|
||||
}
|
||||
|
||||
// rollback removes the pages from a given pending tx.
|
||||
func (f *freelist) rollback(txid txid) {
|
||||
// Remove page ids from cache.
|
||||
for _, id := range f.pending[txid] {
|
||||
delete(f.cache, id)
|
||||
}
|
||||
|
||||
// Remove pages from pending list.
|
||||
delete(f.pending, txid)
|
||||
}
|
||||
|
||||
// freed returns whether a given page is in the free list.
|
||||
func (f *freelist) freed(pgid pgid) bool {
|
||||
return f.cache[pgid]
|
||||
}
|
||||
|
||||
// read initializes the freelist from a freelist page.
|
||||
func (f *freelist) read(p *page) {
|
||||
// If the page.count is at the max uint16 value (64k) then it's considered
|
||||
// an overflow and the size of the freelist is stored as the first element.
|
||||
idx, count := 0, int(p.count)
|
||||
if count == 0xFFFF {
|
||||
idx = 1
|
||||
count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
|
||||
}
|
||||
|
||||
// Copy the list of page ids from the freelist.
|
||||
if count == 0 {
|
||||
f.ids = nil
|
||||
} else {
|
||||
ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count]
|
||||
f.ids = make([]pgid, len(ids))
|
||||
copy(f.ids, ids)
|
||||
|
||||
// Make sure they're sorted.
|
||||
sort.Sort(pgids(f.ids))
|
||||
}
|
||||
|
||||
// Rebuild the page cache.
|
||||
f.reindex()
|
||||
}
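The on-page layout this assumes, matching write() below (sketch only):

// p.count <  0xFFFF : ptr -> [id0, id1, ..., id{count-1}]        ids start at element 0
// p.count == 0xFFFF : ptr -> [N, id0, id1, ..., id{N-1}]          real length N in element 0,
//                                                                 ids start at element 1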
|
||||
|
||||
// write writes the page ids onto a freelist page. All free and pending ids are
|
||||
// saved to disk since in the event of a program crash, all pending ids will
|
||||
// become free.
|
||||
func (f *freelist) write(p *page) error {
|
||||
// Combine the old free pgids and pgids waiting on an open transaction.
|
||||
ids := f.all()
|
||||
|
||||
// Update the header flag.
|
||||
p.flags |= pageFlagFreelist
|
||||
|
||||
// The page.count can only hold up to 64k elements so if we overflow that
|
||||
// number then we handle it by putting the size in the first element.
|
||||
if len(ids) == 0 {
|
||||
p.count = uint16(len(ids))
|
||||
} else if len(ids) < 0xFFFF {
|
||||
p.count = uint16(len(ids))
|
||||
copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids)
|
||||
} else {
|
||||
p.count = 0xFFFF
|
||||
((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids))
|
||||
copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// reload reads the freelist from a page and filters out pending items.
|
||||
func (f *freelist) reload(p *page) {
|
||||
f.read(p)
|
||||
|
||||
// Build a cache of only pending pages.
|
||||
pcache := make(map[pgid]bool)
|
||||
for _, pendingIDs := range f.pending {
|
||||
for _, pendingID := range pendingIDs {
|
||||
pcache[pendingID] = true
|
||||
}
|
||||
}
|
||||
|
||||
// Check each page in the freelist and build a new available freelist
|
||||
// with any pages not in the pending lists.
|
||||
var a []pgid
|
||||
for _, id := range f.ids {
|
||||
if !pcache[id] {
|
||||
a = append(a, id)
|
||||
}
|
||||
}
|
||||
f.ids = a
|
||||
|
||||
// Once the available list is rebuilt then rebuild the free cache so that
|
||||
// it includes the available and pending free pages.
|
||||
f.reindex()
|
||||
}
|
||||
|
||||
// reindex rebuilds the free cache based on available and pending free lists.
|
||||
func (f *freelist) reindex() {
|
||||
f.cache = make(map[pgid]bool)
|
||||
for _, id := range f.ids {
|
||||
f.cache[id] = true
|
||||
}
|
||||
for _, pendingIDs := range f.pending {
|
||||
for _, pendingID := range pendingIDs {
|
||||
f.cache[pendingID] = true
|
||||
}
|
||||
}
|
||||
}
|
158 pages/freelist_test.go Normal file
@@ -0,0 +1,158 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Ensure that a page is added to a transaction's freelist.
|
||||
func TestFreelist_free(t *testing.T) {
|
||||
f := newFreelist()
|
||||
f.free(100, &page{id: 12})
|
||||
if !reflect.DeepEqual([]pgid{12}, f.pending[100]) {
|
||||
t.Fatalf("exp=%v; got=%v", []pgid{12}, f.pending[100])
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that a page and its overflow is added to a transaction's freelist.
|
||||
func TestFreelist_free_overflow(t *testing.T) {
|
||||
f := newFreelist()
|
||||
f.free(100, &page{id: 12, overflow: 3})
|
||||
if exp := []pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100]) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.pending[100])
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that a transaction's free pages can be released.
|
||||
func TestFreelist_release(t *testing.T) {
|
||||
f := newFreelist()
|
||||
f.free(100, &page{id: 12, overflow: 1})
|
||||
f.free(100, &page{id: 9})
|
||||
f.free(102, &page{id: 39})
|
||||
f.release(100)
|
||||
f.release(101)
|
||||
if exp := []pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.ids)
|
||||
}
|
||||
|
||||
f.release(102)
|
||||
if exp := []pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.ids)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that a freelist can find contiguous blocks of pages.
|
||||
func TestFreelist_allocate(t *testing.T) {
|
||||
f := &freelist{ids: []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}}
|
||||
if id := int(f.allocate(3)); id != 3 {
|
||||
t.Fatalf("exp=3; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(1)); id != 6 {
|
||||
t.Fatalf("exp=6; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(3)); id != 0 {
|
||||
t.Fatalf("exp=0; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(2)); id != 12 {
|
||||
t.Fatalf("exp=12; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(1)); id != 7 {
|
||||
t.Fatalf("exp=7; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(0)); id != 0 {
|
||||
t.Fatalf("exp=0; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(0)); id != 0 {
|
||||
t.Fatalf("exp=0; got=%v", id)
|
||||
}
|
||||
if exp := []pgid{9, 18}; !reflect.DeepEqual(exp, f.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.ids)
|
||||
}
|
||||
|
||||
if id := int(f.allocate(1)); id != 9 {
|
||||
t.Fatalf("exp=9; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(1)); id != 18 {
|
||||
t.Fatalf("exp=18; got=%v", id)
|
||||
}
|
||||
if id := int(f.allocate(1)); id != 0 {
|
||||
t.Fatalf("exp=0; got=%v", id)
|
||||
}
|
||||
if exp := []pgid{}; !reflect.DeepEqual(exp, f.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.ids)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that a freelist can deserialize from a freelist page.
|
||||
func TestFreelist_read(t *testing.T) {
|
||||
// Create a page.
|
||||
var buf [4096]byte
|
||||
page := (*page)(unsafe.Pointer(&buf[0]))
|
||||
page.flags = pageFlagFreelist
|
||||
page.count = 2
|
||||
|
||||
// Insert 2 page ids.
|
||||
ids := (*[3]pgid)(unsafe.Pointer(&page.ptr))
|
||||
ids[0] = 23
|
||||
ids[1] = 50
|
||||
|
||||
// Deserialize page into a freelist.
|
||||
f := newFreelist()
|
||||
f.read(page)
|
||||
|
||||
// Ensure that there are two page ids in the freelist.
|
||||
if exp := []pgid{23, 50}; !reflect.DeepEqual(exp, f.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f.ids)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that a freelist can serialize into a freelist page.
|
||||
func TestFreelist_write(t *testing.T) {
|
||||
// Create a freelist and write it to a page.
|
||||
var buf [4096]byte
|
||||
f := &freelist{ids: []pgid{12, 39}, pending: make(map[txid][]pgid)}
|
||||
f.pending[100] = []pgid{28, 11}
|
||||
f.pending[101] = []pgid{3}
|
||||
p := (*page)(unsafe.Pointer(&buf[0]))
|
||||
if err := f.write(p); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read the page back out.
|
||||
f2 := newFreelist()
|
||||
f2.read(p)
|
||||
|
||||
// Ensure that the freelist is correct.
|
||||
// All pages should be present and in sorted order.
|
||||
if exp := []pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.ids) {
|
||||
t.Fatalf("exp=%v; got=%v", exp, f2.ids)
|
||||
}
|
||||
}
|
||||
|
||||
func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) }
|
||||
func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) }
|
||||
func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) }
|
||||
func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) }
|
||||
|
||||
func benchmark_FreelistRelease(b *testing.B, size int) {
|
||||
ids := randomPgids(size)
|
||||
pending := randomPgids(len(ids) / 400)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
f := &freelist{ids: ids, pending: map[txid][]pgid{1: pending}}
|
||||
f.release(1)
|
||||
}
|
||||
}
|
||||
|
||||
func randomPgids(n int) []pgid {
|
||||
rand.Seed(42)
|
||||
pgids := make(pgids, n)
|
||||
for i := range pgids {
|
||||
pgids[i] = pgid(rand.Int63())
|
||||
}
|
||||
sort.Sort(pgids)
|
||||
return pgids
|
||||
}
|
103 pages/page.go Normal file
@@ -0,0 +1,103 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const PageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
|
||||
|
||||
const (
|
||||
// pageFlag{head,tail,body}?
|
||||
pageFlagMeta = 0x02
|
||||
pageFlagFreelist = 0x04
|
||||
pageFlagData = 0x08
|
||||
)
|
||||
|
||||
type pgid uint64
|
||||
|
||||
type page struct {
|
||||
id pgid
|
||||
flags uint16
|
||||
count uint16
|
||||
overflow uint32
|
||||
ptr uintptr
|
||||
}
|
||||
|
||||
// typ returns a human readable page type string.
|
||||
func (p *page) typ() string {
|
||||
if (p.flags & pageFlagMeta) != 0 {
|
||||
return "meta"
|
||||
} else if (p.flags & pageFlagFreelist) != 0 {
|
||||
return "freelist"
|
||||
} else if (p.flags & pageFlagData) != 0 {
|
||||
return "data"
|
||||
}
|
||||
return fmt.Sprintf("unknown<%02x>", p.flags)
|
||||
}
|
||||
|
||||
func (p *page) String() string {
|
||||
return fmt.Sprintf("page<%s,%016x>", p.typ(), p.id)
|
||||
}
|
||||
|
||||
// meta returns a pointer to the metadata section of a page.
|
||||
func (p *page) meta() *meta {
|
||||
return (*meta)(unsafe.Pointer(&p.ptr))
|
||||
}
|
||||
|
||||
// dump writes n bytes of the page to STDERR as hex output.
|
||||
func (p *page) hexdump(n int) {
|
||||
buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
|
||||
fmt.Fprintf(os.Stderr, "%x\n", buf)
|
||||
}
|
||||
|
||||
type pages []*page
|
||||
|
||||
func (s pages) Len() int { return len(s) }
|
||||
func (s pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
||||
func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
|
||||
|
||||
type pgids []pgid
|
||||
|
||||
func (s pgids) Len() int { return len(s) }
|
||||
func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
||||
func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
|
||||
|
||||
// merge returns the sorted union of a and b.
|
||||
func (a pgids) merge(b pgids) pgids {
|
||||
// Return the opposite slice if one is nil.
|
||||
if len(a) == 0 {
|
||||
return b
|
||||
} else if len(b) == 0 {
|
||||
return a
|
||||
}
|
||||
|
||||
// Create a list to hold all elements from both lists.
|
||||
merged := make(pgids, 0, len(a)+len(b))
|
||||
|
||||
// Assign lead to the slice with a lower starting value, follow to the higher value.
|
||||
lead, follow := a, b
|
||||
if b[0] < a[0] {
|
||||
lead, follow = b, a
|
||||
}
|
||||
|
||||
// Continue while there are elements in the lead.
|
||||
for len(lead) > 0 {
|
||||
// Merge largest prefix of lead that is ahead of follow[0].
|
||||
n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
|
||||
merged = append(merged, lead[:n]...)
|
||||
if n >= len(lead) {
|
||||
break
|
||||
}
|
||||
|
||||
// Swap lead and follow.
|
||||
lead, follow = follow, lead[n:]
|
||||
}
|
||||
|
||||
// Append what's left in follow.
|
||||
merged = append(merged, follow...)
|
||||
|
||||
return merged
|
||||
}
|
69 pages/page_test.go Normal file
@@ -0,0 +1,69 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"testing/quick"
|
||||
)
|
||||
|
||||
// Ensure that the page type can be returned in human readable format.
|
||||
func TestPage_typ(t *testing.T) {
|
||||
if typ := (&page{flags: pageFlagData}).typ(); typ != "data" {
|
||||
t.Fatalf("exp=branch; got=%v", typ)
|
||||
}
|
||||
if typ := (&page{flags: pageFlagMeta}).typ(); typ != "meta" {
|
||||
t.Fatalf("exp=meta; got=%v", typ)
|
||||
}
|
||||
if typ := (&page{flags: pageFlagFreelist}).typ(); typ != "freelist" {
|
||||
t.Fatalf("exp=freelist; got=%v", typ)
|
||||
}
|
||||
if typ := (&page{flags: 20000}).typ(); typ != "unknown<4e20>" {
|
||||
t.Fatalf("exp=unknown<4e20>; got=%v", typ)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the hexdump debugging function doesn't blow up.
|
||||
func TestPage_dump(t *testing.T) {
|
||||
(&page{id: 256}).hexdump(16)
|
||||
}
|
||||
|
||||
func TestPgids_merge(t *testing.T) {
|
||||
a := pgids{4, 5, 6, 10, 11, 12, 13, 27}
|
||||
b := pgids{1, 3, 8, 9, 25, 30}
|
||||
c := a.merge(b)
|
||||
if !reflect.DeepEqual(c, pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) {
|
||||
t.Errorf("mismatch: %v", c)
|
||||
}
|
||||
|
||||
a = pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36}
|
||||
b = pgids{8, 9, 25, 30}
|
||||
c = a.merge(b)
|
||||
if !reflect.DeepEqual(c, pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) {
|
||||
t.Errorf("mismatch: %v", c)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPgids_merge_quick(t *testing.T) {
|
||||
if err := quick.Check(func(a, b pgids) bool {
|
||||
// Sort incoming lists.
|
||||
sort.Sort(a)
|
||||
sort.Sort(b)
|
||||
|
||||
// Merge the two lists together.
|
||||
got := a.merge(b)
|
||||
|
||||
// The expected value should be the two lists combined and sorted.
|
||||
exp := append(a, b...)
|
||||
sort.Sort(exp)
|
||||
|
||||
if !reflect.DeepEqual(exp, got) {
|
||||
t.Errorf("\nexp=%+v\ngot=%+v\n", exp, got)
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}, nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
7 pages/pages_amd64.go Normal file
@@ -0,0 +1,7 @@
|
|||
package pages
|
||||
|
||||
// maxMapSize represents the largest mmap size supported by pagebuf.
|
||||
const maxMapSize = 0xFFFFFFFFFFFF // 256TB
|
||||
|
||||
// maxAllocSize is the size used when creating array pointers.
|
||||
const maxAllocSize = 0x7FFFFFFF
|
10 pages/pages_linux.go Normal file
@@ -0,0 +1,10 @@
|
|||
package pages
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// fdatasync flushes written data to a file descriptor.
|
||||
func fdatasync(pb *DB) error {
|
||||
return syscall.Fdatasync(int(pb.file.Fd()))
|
||||
}
|
89 pages/pages_unix.go Normal file
@@ -0,0 +1,89 @@
|
|||
// +build !windows,!plan9,!solaris
|
||||
|
||||
package pages
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"time"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// flock acquires an advisory lock on a file descriptor.
|
||||
func flock(pb *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error {
|
||||
var t time.Time
|
||||
for {
|
||||
// If we're beyond our timeout then return an error.
|
||||
// This can only occur after we've attempted a flock once.
|
||||
if t.IsZero() {
|
||||
t = time.Now()
|
||||
} else if timeout > 0 && time.Since(t) > timeout {
|
||||
return ErrTimeout
|
||||
}
|
||||
flag := syscall.LOCK_SH
|
||||
if exclusive {
|
||||
flag = syscall.LOCK_EX
|
||||
}
|
||||
|
||||
// Otherwise attempt to obtain an exclusive lock.
|
||||
err := syscall.Flock(int(pb.file.Fd()), flag|syscall.LOCK_NB)
|
||||
if err == nil {
|
||||
return nil
|
||||
} else if err != syscall.EWOULDBLOCK {
|
||||
return err
|
||||
}
|
||||
|
||||
// Wait for a bit and try again.
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// funlock releases an advisory lock on a file descriptor.
|
||||
func funlock(pb *DB) error {
|
||||
return syscall.Flock(int(pb.file.Fd()), syscall.LOCK_UN)
|
||||
}
|
||||

// mmap memory maps a DB's data file.
func mmap(pb *DB, sz int) error {
	// Map the data file to memory.
	b, err := syscall.Mmap(int(pb.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|pb.MmapFlags)
	if err != nil {
		return err
	}

	// Advise the kernel that the mmap is accessed randomly.
	if err := madvise(b, syscall.MADV_RANDOM); err != nil {
		return fmt.Errorf("madvise: %s", err)
	}

	// Save the original byte slice and convert to a byte array pointer.
	pb.dataref = b
	pb.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
	pb.datasz = sz
	return nil
}

// munmap unmaps a DB's data file from memory.
func munmap(pb *DB) error {
	// Ignore the unmap if we have no mapped data.
	if pb.dataref == nil {
		return nil
	}

	// Unmap using the original byte slice.
	err := syscall.Munmap(pb.dataref)
	pb.dataref = nil
	pb.data = nil
	pb.datasz = 0
	return err
}

// NOTE: This function is copied from stdlib because it is not available on darwin.
func madvise(b []byte, advice int) (err error) {
	_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice))
	if e1 != 0 {
		err = e1
	}
	return
}
8 pages/pagessync_unix.go Normal file
@@ -0,0 +1,8 @@
// +build !windows,!plan9,!linux,!openbsd

package pages

// fdatasync flushes written data to a file descriptor.
func fdatasync(pb *DB) error {
	return pb.file.Sync()
}
384 pages/tx.go Normal file
@@ -0,0 +1,384 @@
package pages

import (
	"fmt"
	"sort"
	"unsafe"
)

// txid represents the internal transaction identifier.
type txid uint64

// Tx represents a read-only or read/write transaction on the page buffer.
// Read-only transactions can be used for retrieving pages.
// Read/write transactions can retrieve and write pages.
//
// IMPORTANT: You must commit or roll back transactions when you are done with
// them. Pages cannot be reclaimed by the writer until no more transactions
// are using them. A long-running read transaction can cause the database to
// grow quickly.
type Tx struct {
	writable bool
	managed  bool
	db       *DB
	meta     *meta
	pages    map[pgid]*page
	delPages map[pgid]bool

	// WriteFlag specifies the flag for write-related methods like WriteTo().
	// Tx opens the database file with the specified flag to copy the data.
	//
	// By default, the flag is unset, which works well for mostly in-memory
	// workloads. For databases that are much larger than available RAM,
	// set the flag to syscall.O_DIRECT to avoid trashing the page cache.
	WriteFlag int
}
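
Taken together, the doc comment and the exported methods further down (Add, Get, Set, Del, Commit, Rollback) suggest a usage pattern roughly like the sketch below. How a Tx is obtained is not part of this hunk; the Begin method used here is an assumption.

// txUsageSketch shows the commit-or-rollback discipline described above.
// db.Begin is hypothetical; it is not defined in this file.
func txUsageSketch(db *DB) error {
	tx, err := db.Begin(true) // hypothetical: start a writable transaction
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is harmless here (it returns
	// ErrTxClosed once tx.db is nil), so deferring it covers every
	// early-return path.
	defer tx.Rollback()

	id, err := tx.Add([]byte("payload\x00")) // data should carry its own terminator
	if err != nil {
		return err
	}
	if _, err := tx.Get(id); err != nil {
		return err
	}
	return tx.Commit()
}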

// init initializes the transaction.
func (tx *Tx) init(db *DB) {
	tx.db = db
	tx.pages = nil

	// Copy the meta page since it can be changed by the writer.
	tx.meta = &meta{}
	db.meta().copy(tx.meta)

	// Increment the transaction id and add a page cache for writable transactions.
	if tx.writable {
		tx.pages = make(map[pgid]*page)
		tx.delPages = make(map[pgid]bool)
		tx.meta.txid += txid(1)
	}
}

// ID returns the transaction id.
func (tx *Tx) ID() uint64 {
	return uint64(tx.meta.txid)
}

// Size returns current database size in bytes as seen by this transaction.
func (tx *Tx) Size() int64 {
	return int64(tx.meta.pgid) * int64(tx.db.pageSize)
}

// DB returns a reference to the database that created the transaction.
func (tx *Tx) DB() *DB {
	return tx.db
}

// Writable returns whether the transaction can perform write operations.
func (tx *Tx) Writable() bool {
	return tx.writable
}

// Rollback closes the transaction and ignores all previous updates. Read-only
// transactions must be rolled back and not committed.
func (tx *Tx) Rollback() error {
	_assert(!tx.managed, "managed tx rollback not allowed")
	if tx.db == nil {
		return ErrTxClosed
	}
	tx.rollback()
	return nil
}

func (tx *Tx) rollback() {
	if tx.db == nil {
		return
	}
	if tx.writable {
		tx.db.freelist.rollback(tx.meta.txid)
		tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
	}
	tx.close()
}

func (tx *Tx) close() {
	if tx.db == nil {
		return
	}
	if tx.writable {
		// Remove transaction ref & writer lock.
		tx.db.rwtx = nil
		tx.db.rwlock.Unlock()
	} else {
		tx.db.removeTx(tx)
	}

	// Clear all references.
	tx.db = nil
	tx.meta = nil
	tx.pages = nil
}

// page returns a reference to the page with a given id.
// If the page has been written to, then a temporary buffered page is returned.
func (tx *Tx) page(id pgid) *page {
	// Check the dirty pages first.
	if tx.pages != nil {
		if p, ok := tx.pages[id]; ok {
			return p
		}
	}

	// Otherwise return directly from the mmap.
	return tx.db.page(id)
}

func (tx *Tx) pageExists(id pgid) bool {
	// Check whether the page was modified during this transaction.
	if tx.pages != nil {
		if _, ok := tx.pages[id]; ok {
			return true
		}
	}
	// Check whether the page was deleted during this transaction.
	if tx.delPages != nil {
		if tx.delPages[id] {
			return false
		}
	}
	// The page was not touched within this transaction. Fall through to
	// the database's check.
	return tx.db.pageExists(id)
}

// allocate returns a contiguous block of memory starting at a given page.
func (tx *Tx) allocate(count int) (*page, error) {
	p, err := tx.db.allocate(count)
	if err != nil {
		return nil, err
	}

	// Save to our page cache.
	tx.pages[p.id] = p
	return p, nil
}

// Commit writes all changes to disk and updates the meta page.
// Returns an error if a disk write error occurs, or if Commit is
// called on a read-only transaction.
func (tx *Tx) Commit() error {
	_assert(!tx.managed, "managed tx commit not allowed")
	if tx.db == nil {
		return ErrTxClosed
	} else if !tx.writable {
		return ErrTxNotWritable
	}

	// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.

	opgid := tx.meta.pgid

	// Free the freelist and allocate new pages for it. This will overestimate
	// the size of the freelist but not underestimate the size (which would be bad).
	tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
	if err != nil {
		tx.rollback()
		return err
	}
	if err := tx.db.freelist.write(p); err != nil {
		tx.rollback()
		return err
	}
	tx.meta.freelist = p.id

	// If the high water mark has moved up then attempt to grow the database.
	if tx.meta.pgid > opgid {
		if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
			tx.rollback()
			return err
		}
	}

	// Write dirty pages to disk.
	if err := tx.write(); err != nil {
		tx.rollback()
		return err
	}

	// Write meta to disk.
	if err := tx.writeMeta(); err != nil {
		tx.rollback()
		return err
	}

	// Finalize the transaction.
	tx.close()

	return nil
}

// write writes any dirty pages to disk.
func (tx *Tx) write() error {
	// Sort pages by id.
	pages := make(pages, 0, len(tx.pages))
	for _, p := range tx.pages {
		pages = append(pages, p)
	}
	// Clear out page cache early.
	tx.pages = make(map[pgid]*page)
	sort.Sort(pages)

	// Write pages to disk in order.
	for _, p := range pages {
		size := (int(p.overflow) + 1) * tx.db.pageSize
		offset := int64(p.id) * int64(tx.db.pageSize)

		// Write out page in "max allocation" sized chunks.
		ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p))
		for {
			// Limit our write to our max allocation size.
			sz := size
			if sz > maxAllocSize-1 {
				sz = maxAllocSize - 1
			}

			// Write chunk to disk.
			buf := ptr[:sz]
			if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
				return err
			}

			// Exit inner for loop if we've written all the chunks.
			size -= sz
			if size == 0 {
				break
			}

			// Otherwise move offset forward and move pointer to next chunk.
			offset += int64(sz)
			ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz]))
		}
	}

	if err := fdatasync(tx.db); err != nil {
		return err
	}

	// Put small pages back to page pool.
	for _, p := range pages {
		// Ignore page sizes over 1 page.
		// These are allocated using make() instead of the page pool.
		if int(p.overflow) != 0 {
			continue
		}

		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:tx.db.pageSize]

		// See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1
		for i := range buf {
			buf[i] = 0
		}
		tx.db.pagePool.Put(buf)
	}

	return nil
}
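
The inner chunking loop above only matters for pages larger than maxAllocSize. The standalone sketch below illustrates the same idea, writing an arbitrary []byte through an io.WriterAt in bounded chunks; it assumes an "io" import and chunkSize stands in for maxAllocSize − 1.

// writeChunkedSketch writes buf at offset in chunks of at most chunkSize
// bytes. Same idea as the loop in write(); illustration only.
func writeChunkedSketch(w io.WriterAt, buf []byte, offset int64, chunkSize int) error {
	for len(buf) > 0 {
		sz := len(buf)
		if sz > chunkSize {
			sz = chunkSize
		}
		if _, err := w.WriteAt(buf[:sz], offset); err != nil {
			return err
		}
		offset += int64(sz)
		buf = buf[sz:]
	}
	return nil
}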

// writeMeta writes the meta page to disk.
func (tx *Tx) writeMeta() error {
	// Create a temporary buffer for the meta page.
	buf := make([]byte, tx.db.pageSize)
	p := tx.db.pageInBuffer(buf, 0)
	tx.meta.write(p)

	// Write the meta page to file.
	if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
		return err
	}
	if err := fdatasync(tx.db); err != nil {
		return err
	}

	return nil
}

// Get retrieves the bytes stored in the page with the given id.
// The returned byte slice is only valid for the duration of the transaction.
func (tx *Tx) Get(id uint64) ([]byte, error) {
	if !tx.pageExists(pgid(id)) {
		return nil, ErrNotFound
	}

	p := tx.page(pgid(id))
	size := int(p.overflow)*tx.db.pageSize - PageHeaderSize + int(p.count)
	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[:size]

	return b, nil
}

// Add creates a new page with the given content. The inserted byte slice
// will be padded at the end to fit the next largest page size. Retrieving the page
// will return the padding as well.
// Inserted data should therefore include its own termination markers.
func (tx *Tx) Add(c []byte) (uint64, error) {
	l := len(c) + PageHeaderSize // total size required
	n := 1                       // number of pages required
	for n*tx.db.pageSize < l {
		n++
	}
	if l > maxAllocSize {
		return 0, fmt.Errorf("page of size %d too large", l)
	}
	p, err := tx.allocate(n)
	if err != nil {
		return 0, fmt.Errorf("page alloc error: %s", err)
	}
	p.flags |= pageFlagData
	// count holds the length used in the last page.
	p.count = uint16(l - (n-1)*tx.db.pageSize)

	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[:]
	copy(b, c)

	return uint64(p.id), nil
}
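
A quick worked example of Add's sizing arithmetic, assuming pageSize = 4096 and, say, PageHeaderSize = 16 (the real header size is defined elsewhere in the package): for len(c) = 10000, l = 10016, so n = 3 pages are allocated and count = 10016 − 2·4096 = 1824 bytes are used in the last page. The snippet below just restates that calculation; it is illustration, not package code.

// addSizingSketch mirrors the arithmetic in Add: given a payload length,
// a page size and a header size, it returns the number of pages needed
// and the count value stored in the page header. Illustration only.
func addSizingSketch(payloadLen, pageSize, headerSize int) (n, count int) {
	l := payloadLen + headerSize // total size required
	n = 1
	for n*pageSize < l {
		n++
	}
	count = l - (n-1)*pageSize // bytes used in the last page
	return n, count
}

For the numbers above, addSizingSketch(10000, 4096, 16) returns (3, 1824); Get's size formula (overflow·pageSize − PageHeaderSize + count) then evaluates to 2·4096 − 16 + 1824 = 10000, i.e. back to the payload length, assuming allocate sets overflow to n − 1.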

// Del deletes the page with the given ID.
func (tx *Tx) Del(id uint64) error {
	if !tx.pageExists(pgid(id)) {
		return ErrNotFound
	}

	tx.db.freelist.free(tx.meta.txid, tx.db.page(pgid(id)))
	return nil
}

// Set overwrites the page with the given ID with c.
func (tx *Tx) Set(id uint64, c []byte) error {
	if !tx.pageExists(pgid(id)) {
		return ErrNotFound
	}
	p := tx.db.page(pgid(id))

	l := len(c) + PageHeaderSize // total size required
	n := int(p.overflow + 1)
	// The contents must fit into the previously allocated pages.
	if l > n*tx.db.pageSize {
		return fmt.Errorf("invalid overwrite size")
	}

	// Allocate a temporary buffer for the page.
	var buf []byte
	if n == 1 {
		buf = tx.db.pagePool.Get().([]byte)
	} else {
		buf = make([]byte, n*tx.db.pageSize)
	}
	np := tx.db.pageInBuffer(buf, 0)
	*np = *p
	// count holds the length used in the last page.
	np.count = uint16(l - (n-1)*tx.db.pageSize)

	// TODO(fabxc): Potential performance improvement point could be using c directly.
	// Just copy it for now.
	b := (*[maxAllocSize]byte)(unsafe.Pointer(&np.ptr))[:]
	copy(b, c)

	tx.pages[pgid(id)] = np
	// TODO(fabxc): truncate and free pages that are no longer needed.

	return nil
}