vendor: update goleveldb dependencies

This commit is contained in:
Fabian Reinartz 2016-07-04 10:08:49 +02:00
parent f4398d5bdf
commit bc506ce959
37 changed files with 1991 additions and 1405 deletions

View file

@ -12,8 +12,10 @@ import (
"github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/memdb" "github.com/syndtr/goleveldb/leveldb/memdb"
"github.com/syndtr/goleveldb/leveldb/storage"
) )
// ErrBatchCorrupted records reason of batch corruption.
type ErrBatchCorrupted struct { type ErrBatchCorrupted struct {
Reason string Reason string
} }
@ -23,7 +25,7 @@ func (e *ErrBatchCorrupted) Error() string {
} }
func newErrBatchCorrupted(reason string) error { func newErrBatchCorrupted(reason string) error {
return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason}) return errors.NewErrCorrupted(storage.FileDesc{}, &ErrBatchCorrupted{reason})
} }
const ( const (
@ -31,6 +33,7 @@ const (
batchGrowRec = 3000 batchGrowRec = 3000
) )
// BatchReplay wraps basic batch operations.
type BatchReplay interface { type BatchReplay interface {
Put(key, value []byte) Put(key, value []byte)
Delete(key []byte) Delete(key []byte)
@ -67,20 +70,20 @@ func (b *Batch) grow(n int) {
} }
} }
func (b *Batch) appendRec(kt kType, key, value []byte) { func (b *Batch) appendRec(kt keyType, key, value []byte) {
n := 1 + binary.MaxVarintLen32 + len(key) n := 1 + binary.MaxVarintLen32 + len(key)
if kt == ktVal { if kt == keyTypeVal {
n += binary.MaxVarintLen32 + len(value) n += binary.MaxVarintLen32 + len(value)
} }
b.grow(n) b.grow(n)
off := len(b.data) off := len(b.data)
data := b.data[:off+n] data := b.data[:off+n]
data[off] = byte(kt) data[off] = byte(kt)
off += 1 off++
off += binary.PutUvarint(data[off:], uint64(len(key))) off += binary.PutUvarint(data[off:], uint64(len(key)))
copy(data[off:], key) copy(data[off:], key)
off += len(key) off += len(key)
if kt == ktVal { if kt == keyTypeVal {
off += binary.PutUvarint(data[off:], uint64(len(value))) off += binary.PutUvarint(data[off:], uint64(len(value)))
copy(data[off:], value) copy(data[off:], value)
off += len(value) off += len(value)
@ -94,13 +97,13 @@ func (b *Batch) appendRec(kt kType, key, value []byte) {
// Put appends 'put operation' of the given key/value pair to the batch. // Put appends 'put operation' of the given key/value pair to the batch.
// It is safe to modify the contents of the argument after Put returns. // It is safe to modify the contents of the argument after Put returns.
func (b *Batch) Put(key, value []byte) { func (b *Batch) Put(key, value []byte) {
b.appendRec(ktVal, key, value) b.appendRec(keyTypeVal, key, value)
} }
// Delete appends 'delete operation' of the given key to the batch. // Delete appends 'delete operation' of the given key to the batch.
// It is safe to modify the contents of the argument after Delete returns. // It is safe to modify the contents of the argument after Delete returns.
func (b *Batch) Delete(key []byte) { func (b *Batch) Delete(key []byte) {
b.appendRec(ktDel, key, nil) b.appendRec(keyTypeDel, key, nil)
} }
// Dump dumps batch contents. The returned slice can be loaded into the // Dump dumps batch contents. The returned slice can be loaded into the
@ -121,13 +124,14 @@ func (b *Batch) Load(data []byte) error {
// Replay replays batch contents. // Replay replays batch contents.
func (b *Batch) Replay(r BatchReplay) error { func (b *Batch) Replay(r BatchReplay) error {
return b.decodeRec(func(i int, kt kType, key, value []byte) { return b.decodeRec(func(i int, kt keyType, key, value []byte) error {
switch kt { switch kt {
case ktVal: case keyTypeVal:
r.Put(key, value) r.Put(key, value)
case ktDel: case keyTypeDel:
r.Delete(key) r.Delete(key)
} }
return nil
}) })
} }
@ -154,6 +158,7 @@ func (b *Batch) append(p *Batch) {
b.grow(len(p.data) - batchHdrLen) b.grow(len(p.data) - batchHdrLen)
b.data = append(b.data, p.data[batchHdrLen:]...) b.data = append(b.data, p.data[batchHdrLen:]...)
b.rLen += p.rLen b.rLen += p.rLen
b.bLen += p.bLen
} }
if p.sync { if p.sync {
b.sync = true b.sync = true
@ -193,18 +198,19 @@ func (b *Batch) decode(prevSeq uint64, data []byte) error {
return nil return nil
} }
func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) { func (b *Batch) decodeRec(f func(i int, kt keyType, key, value []byte) error) error {
off := batchHdrLen off := batchHdrLen
for i := 0; i < b.rLen; i++ { for i := 0; i < b.rLen; i++ {
if off >= len(b.data) { if off >= len(b.data) {
return newErrBatchCorrupted("invalid records length") return newErrBatchCorrupted("invalid records length")
} }
kt := kType(b.data[off]) kt := keyType(b.data[off])
if kt > ktVal { if kt > keyTypeVal {
panic(kt)
return newErrBatchCorrupted("bad record: invalid type") return newErrBatchCorrupted("bad record: invalid type")
} }
off += 1 off++
x, n := binary.Uvarint(b.data[off:]) x, n := binary.Uvarint(b.data[off:])
off += n off += n
@ -214,7 +220,7 @@ func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error
key := b.data[off : off+int(x)] key := b.data[off : off+int(x)]
off += int(x) off += int(x)
var value []byte var value []byte
if kt == ktVal { if kt == keyTypeVal {
x, n := binary.Uvarint(b.data[off:]) x, n := binary.Uvarint(b.data[off:])
off += n off += n
if n <= 0 || off+int(x) > len(b.data) { if n <= 0 || off+int(x) > len(b.data) {
@ -224,16 +230,19 @@ func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error
off += int(x) off += int(x)
} }
f(i, kt, key, value) if err := f(i, kt, key, value); err != nil {
return err
}
} }
return nil return nil
} }
func (b *Batch) memReplay(to *memdb.DB) error { func (b *Batch) memReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, kt kType, key, value []byte) { var ikScratch []byte
ikey := newIkey(key, b.seq+uint64(i), kt) return b.decodeRec(func(i int, kt keyType, key, value []byte) error {
to.Put(ikey, value) ikScratch = makeInternalKey(ikScratch, key, b.seq+uint64(i), kt)
return to.Put(ikScratch, value)
}) })
} }
@ -245,8 +254,9 @@ func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) er
} }
func (b *Batch) revertMemReplay(to *memdb.DB) error { func (b *Batch) revertMemReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, kt kType, key, value []byte) { var ikScratch []byte
ikey := newIkey(key, b.seq+uint64(i), kt) return b.decodeRec(func(i int, kt keyType, key, value []byte) error {
to.Delete(ikey) ikScratch := makeInternalKey(ikScratch, key, b.seq+uint64(i), kt)
return to.Delete(ikScratch)
}) })
} }

View file

@ -47,17 +47,21 @@ type Cacher interface {
// so the the Release method will be called once object is released. // so the the Release method will be called once object is released.
type Value interface{} type Value interface{}
type CacheGetter struct { // NamespaceGetter provides convenient wrapper for namespace.
type NamespaceGetter struct {
Cache *Cache Cache *Cache
NS uint64 NS uint64
} }
func (g *CacheGetter) Get(key uint64, setFunc func() (size int, value Value)) *Handle { // Get simply calls Cache.Get() method.
func (g *NamespaceGetter) Get(key uint64, setFunc func() (size int, value Value)) *Handle {
return g.Cache.Get(g.NS, key, setFunc) return g.Cache.Get(g.NS, key, setFunc)
} }
// The hash tables implementation is based on: // The hash tables implementation is based on:
// "Dynamic-Sized Nonblocking Hash Tables", by Yujie Liu, Kunlong Zhang, and Michael Spear. ACM Symposium on Principles of Distributed Computing, Jul 2014. // "Dynamic-Sized Nonblocking Hash Tables", by Yujie Liu,
// Kunlong Zhang, and Michael Spear.
// ACM Symposium on Principles of Distributed Computing, Jul 2014.
const ( const (
mInitialSize = 1 << 4 mInitialSize = 1 << 4
@ -610,10 +614,12 @@ func (n *Node) unrefLocked() {
} }
} }
// Handle is a 'cache handle' of a 'cache node'.
type Handle struct { type Handle struct {
n unsafe.Pointer // *Node n unsafe.Pointer // *Node
} }
// Value returns the value of the 'cache node'.
func (h *Handle) Value() Value { func (h *Handle) Value() Value {
n := (*Node)(atomic.LoadPointer(&h.n)) n := (*Node)(atomic.LoadPointer(&h.n))
if n != nil { if n != nil {
@ -622,6 +628,8 @@ func (h *Handle) Value() Value {
return nil return nil
} }
// Release releases this 'cache handle'.
// It is safe to call release multiple times.
func (h *Handle) Release() { func (h *Handle) Release() {
nPtr := atomic.LoadPointer(&h.n) nPtr := atomic.LoadPointer(&h.n)
if nPtr != nil && atomic.CompareAndSwapPointer(&h.n, nPtr, nil) { if nPtr != nil && atomic.CompareAndSwapPointer(&h.n, nPtr, nil) {

View file

@ -33,9 +33,9 @@ func (icmp *iComparer) Name() string {
} }
func (icmp *iComparer) Compare(a, b []byte) int { func (icmp *iComparer) Compare(a, b []byte) int {
x := icmp.ucmp.Compare(iKey(a).ukey(), iKey(b).ukey()) x := icmp.ucmp.Compare(internalKey(a).ukey(), internalKey(b).ukey())
if x == 0 { if x == 0 {
if m, n := iKey(a).num(), iKey(b).num(); m > n { if m, n := internalKey(a).num(), internalKey(b).num(); m > n {
x = -1 x = -1
} else if m < n { } else if m < n {
x = 1 x = 1
@ -45,13 +45,13 @@ func (icmp *iComparer) Compare(a, b []byte) int {
} }
func (icmp *iComparer) Separator(dst, a, b []byte) []byte { func (icmp *iComparer) Separator(dst, a, b []byte) []byte {
ua, ub := iKey(a).ukey(), iKey(b).ukey() ua, ub := internalKey(a).ukey(), internalKey(b).ukey()
dst = icmp.ucmp.Separator(dst, ua, ub) dst = icmp.ucmp.Separator(dst, ua, ub)
if dst == nil { if dst == nil {
return nil return nil
} }
if len(dst) < len(ua) && icmp.uCompare(ua, dst) < 0 { if len(dst) < len(ua) && icmp.uCompare(ua, dst) < 0 {
dst = append(dst, kMaxNumBytes...) dst = append(dst, keyMaxNumBytes...)
} else { } else {
// Did not close possibilities that n maybe longer than len(ub). // Did not close possibilities that n maybe longer than len(ub).
dst = append(dst, a[len(a)-8:]...) dst = append(dst, a[len(a)-8:]...)
@ -60,13 +60,13 @@ func (icmp *iComparer) Separator(dst, a, b []byte) []byte {
} }
func (icmp *iComparer) Successor(dst, b []byte) []byte { func (icmp *iComparer) Successor(dst, b []byte) []byte {
ub := iKey(b).ukey() ub := internalKey(b).ukey()
dst = icmp.ucmp.Successor(dst, ub) dst = icmp.ucmp.Successor(dst, ub)
if dst == nil { if dst == nil {
return nil return nil
} }
if len(dst) < len(ub) && icmp.uCompare(ub, dst) < 0 { if len(dst) < len(ub) && icmp.uCompare(ub, dst) < 0 {
dst = append(dst, kMaxNumBytes...) dst = append(dst, keyMaxNumBytes...)
} else { } else {
// Did not close possibilities that n maybe longer than len(ub). // Did not close possibilities that n maybe longer than len(ub).
dst = append(dst, b[len(b)-8:]...) dst = append(dst, b[len(b)-8:]...)

View file

@ -41,8 +41,8 @@ type DB struct {
mem, frozenMem *memDB mem, frozenMem *memDB
journal *journal.Writer journal *journal.Writer
journalWriter storage.Writer journalWriter storage.Writer
journalFile storage.File journalFd storage.FileDesc
frozenJournalFile storage.File frozenJournalFd storage.FileDesc
frozenSeq uint64 frozenSeq uint64
// Snapshot. // Snapshot.
@ -61,8 +61,10 @@ type DB struct {
writeDelayN int writeDelayN int
journalC chan *Batch journalC chan *Batch
journalAckC chan error journalAckC chan error
tr *Transaction
// Compaction. // Compaction.
compCommitLk sync.Mutex
tcompCmdC chan cCmd tcompCmdC chan cCmd
tcompPauseC chan chan<- struct{} tcompPauseC chan chan<- struct{}
mcompCmdC chan cCmd mcompCmdC chan cCmd
@ -70,7 +72,8 @@ type DB struct {
compPerErrC chan error compPerErrC chan error
compErrSetC chan error compErrSetC chan error
compWriteLocking bool compWriteLocking bool
compStats []cStats compStats cStats
memdbMaxLevel int // For testing.
// Close. // Close.
closeW sync.WaitGroup closeW sync.WaitGroup
@ -104,7 +107,6 @@ func openDB(s *session) (*DB, error) {
compErrC: make(chan error), compErrC: make(chan error),
compPerErrC: make(chan error), compPerErrC: make(chan error),
compErrSetC: make(chan error), compErrSetC: make(chan error),
compStats: make([]cStats, s.o.GetNumLevel()),
// Close // Close
closeC: make(chan struct{}), closeC: make(chan struct{}),
} }
@ -209,7 +211,7 @@ func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) {
// The returned DB instance is goroutine-safe. // The returned DB instance is goroutine-safe.
// The DB must be closed after use, by calling Close method. // The DB must be closed after use, by calling Close method.
func OpenFile(path string, o *opt.Options) (db *DB, err error) { func OpenFile(path string, o *opt.Options) (db *DB, err error) {
stor, err := storage.OpenFile(path) stor, err := storage.OpenFile(path, o.GetReadOnly())
if err != nil { if err != nil {
return return
} }
@ -259,7 +261,7 @@ func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) {
// The returned DB instance is goroutine-safe. // The returned DB instance is goroutine-safe.
// The DB must be closed after use, by calling Close method. // The DB must be closed after use, by calling Close method.
func RecoverFile(path string, o *opt.Options) (db *DB, err error) { func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
stor, err := storage.OpenFile(path) stor, err := storage.OpenFile(path, false)
if err != nil { if err != nil {
return return
} }
@ -278,12 +280,11 @@ func recoverTable(s *session, o *opt.Options) error {
o.Strict &= ^opt.StrictReader o.Strict &= ^opt.StrictReader
// Get all tables and sort it by file number. // Get all tables and sort it by file number.
tableFiles_, err := s.getFiles(storage.TypeTable) fds, err := s.stor.List(storage.TypeTable)
if err != nil { if err != nil {
return err return err
} }
tableFiles := files(tableFiles_) sortFds(fds)
tableFiles.sort()
var ( var (
maxSeq uint64 maxSeq uint64
@ -296,17 +297,17 @@ func recoverTable(s *session, o *opt.Options) error {
rec = &sessionRecord{} rec = &sessionRecord{}
bpool = util.NewBufferPool(o.GetBlockSize() + 5) bpool = util.NewBufferPool(o.GetBlockSize() + 5)
) )
buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) { buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
tmp = s.newTemp() tmpFd = s.newTemp()
writer, err := tmp.Create() writer, err := s.stor.Create(tmpFd)
if err != nil { if err != nil {
return return
} }
defer func() { defer func() {
writer.Close() writer.Close()
if err != nil { if err != nil {
tmp.Remove() s.stor.Remove(tmpFd)
tmp = nil tmpFd = storage.FileDesc{}
} }
}() }()
@ -314,7 +315,7 @@ func recoverTable(s *session, o *opt.Options) error {
tw := table.NewWriter(writer, o) tw := table.NewWriter(writer, o)
for iter.Next() { for iter.Next() {
key := iter.Key() key := iter.Key()
if validIkey(key) { if validInternalKey(key) {
err = tw.Append(key, iter.Value()) err = tw.Append(key, iter.Value())
if err != nil { if err != nil {
return return
@ -338,9 +339,9 @@ func recoverTable(s *session, o *opt.Options) error {
size = int64(tw.BytesLen()) size = int64(tw.BytesLen())
return return
} }
recoverTable := func(file storage.File) error { recoverTable := func(fd storage.FileDesc) error {
s.logf("table@recovery recovering @%d", file.Num()) s.logf("table@recovery recovering @%d", fd.Num)
reader, err := file.Open() reader, err := s.stor.Open(fd)
if err != nil { if err != nil {
return err return err
} }
@ -362,7 +363,7 @@ func recoverTable(s *session, o *opt.Options) error {
tgoodKey, tcorruptedKey, tcorruptedBlock int tgoodKey, tcorruptedKey, tcorruptedBlock int
imin, imax []byte imin, imax []byte
) )
tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o) tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
if err != nil { if err != nil {
return err return err
} }
@ -370,7 +371,7 @@ func recoverTable(s *session, o *opt.Options) error {
if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok { if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
itererr.SetErrorCallback(func(err error) { itererr.SetErrorCallback(func(err error) {
if errors.IsCorrupted(err) { if errors.IsCorrupted(err) {
s.logf("table@recovery block corruption @%d %q", file.Num(), err) s.logf("table@recovery block corruption @%d %q", fd.Num, err)
tcorruptedBlock++ tcorruptedBlock++
} }
}) })
@ -379,7 +380,7 @@ func recoverTable(s *session, o *opt.Options) error {
// Scan the table. // Scan the table.
for iter.Next() { for iter.Next() {
key := iter.Key() key := iter.Key()
_, seq, _, kerr := parseIkey(key) _, seq, _, kerr := parseInternalKey(key)
if kerr != nil { if kerr != nil {
tcorruptedKey++ tcorruptedKey++
continue continue
@ -405,23 +406,23 @@ func recoverTable(s *session, o *opt.Options) error {
if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) { if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
droppedTable++ droppedTable++
s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
return nil return nil
} }
if tgoodKey > 0 { if tgoodKey > 0 {
if tcorruptedKey > 0 || tcorruptedBlock > 0 { if tcorruptedKey > 0 || tcorruptedBlock > 0 {
// Rebuild the table. // Rebuild the table.
s.logf("table@recovery rebuilding @%d", file.Num()) s.logf("table@recovery rebuilding @%d", fd.Num)
iter := tr.NewIterator(nil, nil) iter := tr.NewIterator(nil, nil)
tmp, newSize, err := buildTable(iter) tmpFd, newSize, err := buildTable(iter)
iter.Release() iter.Release()
if err != nil { if err != nil {
return err return err
} }
closed = true closed = true
reader.Close() reader.Close()
if err := file.Replace(tmp); err != nil { if err := s.stor.Rename(tmpFd, fd); err != nil {
return err return err
} }
size = newSize size = newSize
@ -431,30 +432,30 @@ func recoverTable(s *session, o *opt.Options) error {
} }
recoveredKey += tgoodKey recoveredKey += tgoodKey
// Add table to level 0. // Add table to level 0.
rec.addTable(0, file.Num(), uint64(size), imin, imax) rec.addTable(0, fd.Num, size, imin, imax)
s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
} else { } else {
droppedTable++ droppedTable++
s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size) s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
} }
return nil return nil
} }
// Recover all tables. // Recover all tables.
if len(tableFiles) > 0 { if len(fds) > 0 {
s.logf("table@recovery F·%d", len(tableFiles)) s.logf("table@recovery F·%d", len(fds))
// Mark file number as used. // Mark file number as used.
s.markFileNum(tableFiles[len(tableFiles)-1].Num()) s.markFileNum(fds[len(fds)-1].Num)
for _, file := range tableFiles { for _, fd := range fds {
if err := recoverTable(file); err != nil { if err := recoverTable(fd); err != nil {
return err return err
} }
} }
s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, maxSeq) s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
} }
// Set sequence number. // Set sequence number.
@ -471,31 +472,31 @@ func recoverTable(s *session, o *opt.Options) error {
func (db *DB) recoverJournal() error { func (db *DB) recoverJournal() error {
// Get all journals and sort it by file number. // Get all journals and sort it by file number.
allJournalFiles, err := db.s.getFiles(storage.TypeJournal) rawFds, err := db.s.stor.List(storage.TypeJournal)
if err != nil { if err != nil {
return err return err
} }
files(allJournalFiles).sort() sortFds(rawFds)
// Journals that will be recovered. // Journals that will be recovered.
var recJournalFiles []storage.File var fds []storage.FileDesc
for _, jf := range allJournalFiles { for _, fd := range rawFds {
if jf.Num() >= db.s.stJournalNum || jf.Num() == db.s.stPrevJournalNum { if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
recJournalFiles = append(recJournalFiles, jf) fds = append(fds, fd)
} }
} }
var ( var (
of storage.File // Obsolete file. ofd storage.FileDesc // Obsolete file.
rec = &sessionRecord{} rec = &sessionRecord{}
) )
// Recover journals. // Recover journals.
if len(recJournalFiles) > 0 { if len(fds) > 0 {
db.logf("journal@recovery F·%d", len(recJournalFiles)) db.logf("journal@recovery F·%d", len(fds))
// Mark file number as used. // Mark file number as used.
db.s.markFileNum(recJournalFiles[len(recJournalFiles)-1].Num()) db.s.markFileNum(fds[len(fds)-1].Num)
var ( var (
// Options. // Options.
@ -509,31 +510,31 @@ func (db *DB) recoverJournal() error {
batch = &Batch{} batch = &Batch{}
) )
for _, jf := range recJournalFiles { for _, fd := range fds {
db.logf("journal@recovery recovering @%d", jf.Num()) db.logf("journal@recovery recovering @%d", fd.Num)
fr, err := jf.Open() fr, err := db.s.stor.Open(fd)
if err != nil { if err != nil {
return err return err
} }
// Create or reset journal reader instance. // Create or reset journal reader instance.
if jr == nil { if jr == nil {
jr = journal.NewReader(fr, dropper{db.s, jf}, strict, checksum) jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
} else { } else {
jr.Reset(fr, dropper{db.s, jf}, strict, checksum) jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
} }
// Flush memdb and remove obsolete journal file. // Flush memdb and remove obsolete journal file.
if of != nil { if !ofd.Nil() {
if mdb.Len() > 0 { if mdb.Len() > 0 {
if _, err := db.s.flushMemdb(rec, mdb, -1); err != nil { if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
fr.Close() fr.Close()
return err return err
} }
} }
rec.setJournalNum(jf.Num()) rec.setJournalNum(fd.Num)
rec.setSeqNum(db.seq) rec.setSeqNum(db.seq)
if err := db.s.commit(rec); err != nil { if err := db.s.commit(rec); err != nil {
fr.Close() fr.Close()
@ -541,8 +542,8 @@ func (db *DB) recoverJournal() error {
} }
rec.resetAddedTables() rec.resetAddedTables()
of.Remove() db.s.stor.Remove(ofd)
of = nil ofd = storage.FileDesc{}
} }
// Replay journal to memdb. // Replay journal to memdb.
@ -555,7 +556,7 @@ func (db *DB) recoverJournal() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
buf.Reset() buf.Reset()
@ -566,7 +567,7 @@ func (db *DB) recoverJournal() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil { if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
if !strict && errors.IsCorrupted(err) { if !strict && errors.IsCorrupted(err) {
@ -576,7 +577,7 @@ func (db *DB) recoverJournal() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
// Save sequence number. // Save sequence number.
@ -594,7 +595,7 @@ func (db *DB) recoverJournal() error {
} }
fr.Close() fr.Close()
of = jf ofd = fd
} }
// Flush the last memdb. // Flush the last memdb.
@ -611,7 +612,7 @@ func (db *DB) recoverJournal() error {
} }
// Commit. // Commit.
rec.setJournalNum(db.journalFile.Num()) rec.setJournalNum(db.journalFd.Num)
rec.setSeqNum(db.seq) rec.setSeqNum(db.seq)
if err := db.s.commit(rec); err != nil { if err := db.s.commit(rec); err != nil {
// Close journal on error. // Close journal on error.
@ -623,8 +624,8 @@ func (db *DB) recoverJournal() error {
} }
// Remove the last obsolete journal file. // Remove the last obsolete journal file.
if of != nil { if !ofd.Nil() {
of.Remove() db.s.stor.Remove(ofd)
} }
return nil return nil
@ -632,17 +633,17 @@ func (db *DB) recoverJournal() error {
func (db *DB) recoverJournalRO() error { func (db *DB) recoverJournalRO() error {
// Get all journals and sort it by file number. // Get all journals and sort it by file number.
allJournalFiles, err := db.s.getFiles(storage.TypeJournal) rawFds, err := db.s.stor.List(storage.TypeJournal)
if err != nil { if err != nil {
return err return err
} }
files(allJournalFiles).sort() sortFds(rawFds)
// Journals that will be recovered. // Journals that will be recovered.
var recJournalFiles []storage.File var fds []storage.FileDesc
for _, jf := range allJournalFiles { for _, fd := range rawFds {
if jf.Num() >= db.s.stJournalNum || jf.Num() == db.s.stPrevJournalNum { if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
recJournalFiles = append(recJournalFiles, jf) fds = append(fds, fd)
} }
} }
@ -656,8 +657,8 @@ func (db *DB) recoverJournalRO() error {
) )
// Recover journals. // Recover journals.
if len(recJournalFiles) > 0 { if len(fds) > 0 {
db.logf("journal@recovery RO·Mode F·%d", len(recJournalFiles)) db.logf("journal@recovery RO·Mode F·%d", len(fds))
var ( var (
jr *journal.Reader jr *journal.Reader
@ -665,19 +666,19 @@ func (db *DB) recoverJournalRO() error {
batch = &Batch{} batch = &Batch{}
) )
for _, jf := range recJournalFiles { for _, fd := range fds {
db.logf("journal@recovery recovering @%d", jf.Num()) db.logf("journal@recovery recovering @%d", fd.Num)
fr, err := jf.Open() fr, err := db.s.stor.Open(fd)
if err != nil { if err != nil {
return err return err
} }
// Create or reset journal reader instance. // Create or reset journal reader instance.
if jr == nil { if jr == nil {
jr = journal.NewReader(fr, dropper{db.s, jf}, strict, checksum) jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
} else { } else {
jr.Reset(fr, dropper{db.s, jf}, strict, checksum) jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
} }
// Replay journal to memdb. // Replay journal to memdb.
@ -689,7 +690,7 @@ func (db *DB) recoverJournalRO() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
buf.Reset() buf.Reset()
@ -700,7 +701,7 @@ func (db *DB) recoverJournalRO() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil { if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mdb); err != nil {
if !strict && errors.IsCorrupted(err) { if !strict && errors.IsCorrupted(err) {
@ -710,7 +711,7 @@ func (db *DB) recoverJournalRO() error {
} }
fr.Close() fr.Close()
return errors.SetFile(err, jf) return errors.SetFd(err, fd)
} }
// Save sequence number. // Save sequence number.
@ -727,46 +728,35 @@ func (db *DB) recoverJournalRO() error {
return nil return nil
} }
func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) {
ikey := newIkey(key, seq, ktSeek) mk, mv, err := mdb.Find(ikey)
if err == nil {
em, fm := db.getMems() ukey, _, kt, kerr := parseInternalKey(mk)
for _, m := range [...]*memDB{em, fm} {
if m == nil {
continue
}
defer m.decref()
mk, mv, me := m.Find(ikey)
if me == nil {
ukey, _, kt, kerr := parseIkey(mk)
if kerr != nil { if kerr != nil {
// Shouldn't have had happen. // Shouldn't have had happen.
panic(kerr) panic(kerr)
} }
if db.s.icmp.uCompare(ukey, key) == 0 { if icmp.uCompare(ukey, ikey.ukey()) == 0 {
if kt == ktDel { if kt == keyTypeDel {
return nil, ErrNotFound return true, nil, ErrNotFound
}
return append([]byte{}, mv...), nil
}
} else if me != ErrNotFound {
return nil, me
}
} }
return true, mv, nil
v := db.s.version() }
value, cSched, err := v.get(ikey, ro, false) } else if err != ErrNotFound {
v.release() return true, nil, err
if cSched {
// Trigger table compaction.
db.compSendTrigger(db.tcompCmdC)
} }
return return
} }
func (db *DB) has(key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) { func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
ikey := newIkey(key, seq, ktSeek) ikey := makeInternalKey(nil, key, seq, keyTypeSeek)
if auxm != nil {
if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok {
return append([]byte{}, mv...), me
}
}
em, fm := db.getMems() em, fm := db.getMems()
for _, m := range [...]*memDB{em, fm} { for _, m := range [...]*memDB{em, fm} {
@ -775,30 +765,55 @@ func (db *DB) has(key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err er
} }
defer m.decref() defer m.decref()
mk, _, me := m.Find(ikey) if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok {
if me == nil { return append([]byte{}, mv...), me
ukey, _, kt, kerr := parseIkey(mk)
if kerr != nil {
// Shouldn't have had happen.
panic(kerr)
}
if db.s.icmp.uCompare(ukey, key) == 0 {
if kt == ktDel {
return false, nil
}
return true, nil
}
} else if me != ErrNotFound {
return false, me
} }
} }
v := db.s.version() v := db.s.version()
_, cSched, err := v.get(ikey, ro, true) value, cSched, err := v.get(auxt, ikey, ro, false)
v.release() v.release()
if cSched { if cSched {
// Trigger table compaction. // Trigger table compaction.
db.compSendTrigger(db.tcompCmdC) db.compTrigger(db.tcompCmdC)
}
return
}
func nilIfNotFound(err error) error {
if err == ErrNotFound {
return nil
}
return err
}
func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) {
ikey := makeInternalKey(nil, key, seq, keyTypeSeek)
if auxm != nil {
if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok {
return me == nil, nilIfNotFound(me)
}
}
em, fm := db.getMems()
for _, m := range [...]*memDB{em, fm} {
if m == nil {
continue
}
defer m.decref()
if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok {
return me == nil, nilIfNotFound(me)
}
}
v := db.s.version()
_, cSched, err := v.get(auxt, ikey, ro, true)
v.release()
if cSched {
// Trigger table compaction.
db.compTrigger(db.tcompCmdC)
} }
if err == nil { if err == nil {
ret = true ret = true
@ -822,7 +837,7 @@ func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) {
se := db.acquireSnapshot() se := db.acquireSnapshot()
defer db.releaseSnapshot(se) defer db.releaseSnapshot(se)
return db.get(key, se.seq, ro) return db.get(nil, nil, key, se.seq, ro)
} }
// Has returns true if the DB does contains the given key. // Has returns true if the DB does contains the given key.
@ -836,11 +851,11 @@ func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) {
se := db.acquireSnapshot() se := db.acquireSnapshot()
defer db.releaseSnapshot(se) defer db.releaseSnapshot(se)
return db.has(key, se.seq, ro) return db.has(nil, nil, key, se.seq, ro)
} }
// NewIterator returns an iterator for the latest snapshot of the // NewIterator returns an iterator for the latest snapshot of the
// uderlying DB. // underlying DB.
// The returned iterator is not goroutine-safe, but it is safe to use // The returned iterator is not goroutine-safe, but it is safe to use
// multiple iterators concurrently, with each in a dedicated goroutine. // multiple iterators concurrently, with each in a dedicated goroutine.
// It is also safe to use an iterator concurrently with modifying its // It is also safe to use an iterator concurrently with modifying its
@ -864,7 +879,7 @@ func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Itera
defer db.releaseSnapshot(se) defer db.releaseSnapshot(se)
// Iterator holds 'version' lock, 'version' is immutable so snapshot // Iterator holds 'version' lock, 'version' is immutable so snapshot
// can be released after iterator created. // can be released after iterator created.
return db.newIterator(se.seq, slice, ro) return db.newIterator(nil, nil, se.seq, slice, ro)
} }
// GetSnapshot returns a latest snapshot of the underlying DB. A snapshot // GetSnapshot returns a latest snapshot of the underlying DB. A snapshot
@ -920,7 +935,7 @@ func (db *DB) GetProperty(name string) (value string, err error) {
var level uint var level uint
var rest string var rest string
n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
if n != 1 || int(level) >= db.s.o.GetNumLevel() { if n != 1 {
err = ErrNotFound err = ErrNotFound
} else { } else {
value = fmt.Sprint(v.tLen(int(level))) value = fmt.Sprint(v.tLen(int(level)))
@ -929,8 +944,8 @@ func (db *DB) GetProperty(name string) (value string, err error) {
value = "Compactions\n" + value = "Compactions\n" +
" Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" + " Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" +
"-------+------------+---------------+---------------+---------------+---------------\n" "-------+------------+---------------+---------------+---------------+---------------\n"
for level, tables := range v.tables { for level, tables := range v.levels {
duration, read, write := db.compStats[level].get() duration, read, write := db.compStats.getStat(level)
if len(tables) == 0 && duration == 0 { if len(tables) == 0 && duration == 0 {
continue continue
} }
@ -939,10 +954,10 @@ func (db *DB) GetProperty(name string) (value string, err error) {
float64(read)/1048576.0, float64(write)/1048576.0) float64(read)/1048576.0, float64(write)/1048576.0)
} }
case p == "sstables": case p == "sstables":
for level, tables := range v.tables { for level, tables := range v.levels {
value += fmt.Sprintf("--- level %d ---\n", level) value += fmt.Sprintf("--- level %d ---\n", level)
for _, t := range tables { for _, t := range tables {
value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.file.Num(), t.size, t.imin, t.imax) value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.fd.Num, t.size, t.imin, t.imax)
} }
} }
case p == "blockpool": case p == "blockpool":
@ -982,8 +997,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
sizes := make(Sizes, 0, len(ranges)) sizes := make(Sizes, 0, len(ranges))
for _, r := range ranges { for _, r := range ranges {
imin := newIkey(r.Start, kMaxSeq, ktSeek) imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek)
imax := newIkey(r.Limit, kMaxSeq, ktSeek) imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek)
start, err := v.offsetOf(imin) start, err := v.offsetOf(imin)
if err != nil { if err != nil {
return nil, err return nil, err
@ -992,7 +1007,7 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
var size uint64 var size int64
if limit >= start { if limit >= start {
size = limit - start size = limit - start
} }
@ -1002,8 +1017,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
return sizes, nil return sizes, nil
} }
// Close closes the DB. This will also releases any outstanding snapshot and // Close closes the DB. This will also releases any outstanding snapshot,
// abort any in-flight compaction. // abort any in-flight compaction and discard open transaction.
// //
// It is not safe to close a DB until all outstanding iterators are released. // It is not safe to close a DB until all outstanding iterators are released.
// It is valid to call Close multiple times. Other methods should not be // It is valid to call Close multiple times. Other methods should not be
@ -1032,11 +1047,18 @@ func (db *DB) Close() error {
// Signal all goroutines. // Signal all goroutines.
close(db.closeC) close(db.closeC)
// Discard open transaction.
if db.tr != nil {
db.tr.Discard()
}
// Acquire writer lock.
db.writeLockC <- struct{}{}
// Wait for all gorotines to exit. // Wait for all gorotines to exit.
db.closeW.Wait() db.closeW.Wait()
// Lock writer and closes journal. // Closes journal.
db.writeLockC <- struct{}{}
if db.journal != nil { if db.journal != nil {
db.journal.Close() db.journal.Close()
db.journalWriter.Close() db.journalWriter.Close()
@ -1063,8 +1085,6 @@ func (db *DB) Close() error {
db.frozenMem = nil db.frozenMem = nil
db.journal = nil db.journal = nil
db.journalWriter = nil db.journalWriter = nil
db.journalFile = nil
db.frozenJournalFile = nil
db.closer = nil db.closer = nil
return err return err

View file

@ -12,55 +12,76 @@ import (
"github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage"
) )
var ( var (
errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting") errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting")
) )
type cStats struct { type cStat struct {
sync.Mutex
duration time.Duration duration time.Duration
read uint64 read int64
write uint64 write int64
} }
func (p *cStats) add(n *cStatsStaging) { func (p *cStat) add(n *cStatStaging) {
p.Lock()
p.duration += n.duration p.duration += n.duration
p.read += n.read p.read += n.read
p.write += n.write p.write += n.write
p.Unlock()
} }
func (p *cStats) get() (duration time.Duration, read, write uint64) { func (p *cStat) get() (duration time.Duration, read, write int64) {
p.Lock()
defer p.Unlock()
return p.duration, p.read, p.write return p.duration, p.read, p.write
} }
type cStatsStaging struct { type cStatStaging struct {
start time.Time start time.Time
duration time.Duration duration time.Duration
on bool on bool
read uint64 read int64
write uint64 write int64
} }
func (p *cStatsStaging) startTimer() { func (p *cStatStaging) startTimer() {
if !p.on { if !p.on {
p.start = time.Now() p.start = time.Now()
p.on = true p.on = true
} }
} }
func (p *cStatsStaging) stopTimer() { func (p *cStatStaging) stopTimer() {
if p.on { if p.on {
p.duration += time.Since(p.start) p.duration += time.Since(p.start)
p.on = false p.on = false
} }
} }
type cStats struct {
lk sync.Mutex
stats []cStat
}
func (p *cStats) addStat(level int, n *cStatStaging) {
p.lk.Lock()
if level >= len(p.stats) {
newStats := make([]cStat, level+1)
copy(newStats, p.stats)
p.stats = newStats
}
p.stats[level].add(n)
p.lk.Unlock()
}
func (p *cStats) getStat(level int) (duration time.Duration, read, write int64) {
p.lk.Lock()
defer p.lk.Unlock()
if level < len(p.stats) {
return p.stats[level].get()
}
return
}
func (db *DB) compactionError() { func (db *DB) compactionError() {
var err error var err error
noerr: noerr:
@ -151,7 +172,7 @@ func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
disableBackoff = db.s.o.GetDisableCompactionBackoff() disableBackoff = db.s.o.GetDisableCompactionBackoff()
) )
for n := 0; ; n++ { for n := 0; ; n++ {
// Check wether the DB is closed. // Check whether the DB is closed.
if db.isClosed() { if db.isClosed() {
db.logf("%s exiting", name) db.logf("%s exiting", name)
db.compactionExitTransact() db.compactionExitTransact()
@ -235,6 +256,14 @@ func (db *DB) compactionExitTransact() {
panic(errCompactionTransactExiting) panic(errCompactionTransactExiting)
} }
func (db *DB) compactionCommit(name string, rec *sessionRecord) {
db.compCommitLk.Lock()
defer db.compCommitLk.Unlock() // Defer is necessary.
db.compactionTransactFunc(name+"@commit", func(cnt *compactionTransactCounter) error {
return db.s.commit(rec)
}, nil)
}
func (db *DB) memCompaction() { func (db *DB) memCompaction() {
mdb := db.getFrozenMem() mdb := db.getFrozenMem()
if mdb == nil { if mdb == nil {
@ -265,41 +294,40 @@ func (db *DB) memCompaction() {
var ( var (
rec = &sessionRecord{} rec = &sessionRecord{}
stats = &cStatsStaging{} stats = &cStatStaging{}
flushLevel int flushLevel int
) )
// Generate tables.
db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) { db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) {
stats.startTimer() stats.startTimer()
flushLevel, err = db.s.flushMemdb(rec, mdb.DB, -1) flushLevel, err = db.s.flushMemdb(rec, mdb.DB, db.memdbMaxLevel)
stats.stopTimer() stats.stopTimer()
return return
}, func() error { }, func() error {
for _, r := range rec.addedTables { for _, r := range rec.addedTables {
db.logf("memdb@flush revert @%d", r.num) db.logf("memdb@flush revert @%d", r.num)
f := db.s.getTableFile(r.num) if err := db.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: r.num}); err != nil {
if err := f.Remove(); err != nil {
return err return err
} }
} }
return nil return nil
}) })
db.compactionTransactFunc("memdb@commit", func(cnt *compactionTransactCounter) (err error) { rec.setJournalNum(db.journalFd.Num)
stats.startTimer()
rec.setJournalNum(db.journalFile.Num())
rec.setSeqNum(db.frozenSeq) rec.setSeqNum(db.frozenSeq)
err = db.s.commit(rec)
// Commit.
stats.startTimer()
db.compactionCommit("memdb", rec)
stats.stopTimer() stats.stopTimer()
return
}, nil)
db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration) db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration)
for _, r := range rec.addedTables { for _, r := range rec.addedTables {
stats.write += r.size stats.write += r.size
} }
db.compStats[flushLevel].add(stats) db.compStats.addStat(flushLevel, stats)
// Drop frozen memdb. // Drop frozen memdb.
db.dropFrozenMem() db.dropFrozenMem()
@ -315,7 +343,7 @@ func (db *DB) memCompaction() {
} }
// Trigger table compaction. // Trigger table compaction.
db.compSendTrigger(db.tcompCmdC) db.compTrigger(db.tcompCmdC)
} }
type tableCompactionBuilder struct { type tableCompactionBuilder struct {
@ -323,7 +351,7 @@ type tableCompactionBuilder struct {
s *session s *session
c *compaction c *compaction
rec *sessionRecord rec *sessionRecord
stat0, stat1 *cStatsStaging stat0, stat1 *cStatStaging
snapHasLastUkey bool snapHasLastUkey bool
snapLastUkey []byte snapLastUkey []byte
@ -377,9 +405,9 @@ func (b *tableCompactionBuilder) flush() error {
if err != nil { if err != nil {
return err return err
} }
b.rec.addTableFile(b.c.level+1, t) b.rec.addTableFile(b.c.sourceLevel+1, t)
b.stat1.write += t.size b.stat1.write += t.size
b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.level+1, t.file.Num(), b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.sourceLevel+1, t.fd.Num, b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
b.tw = nil b.tw = nil
return nil return nil
} }
@ -424,7 +452,7 @@ func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
} }
ikey := iter.Key() ikey := iter.Key()
ukey, seq, kt, kerr := parseIkey(ikey) ukey, seq, kt, kerr := parseInternalKey(ikey)
if kerr == nil { if kerr == nil {
shouldStop := !resumed && b.c.shouldStopBefore(ikey) shouldStop := !resumed && b.c.shouldStopBefore(ikey)
@ -450,14 +478,14 @@ func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
hasLastUkey = true hasLastUkey = true
lastUkey = append(lastUkey[:0], ukey...) lastUkey = append(lastUkey[:0], ukey...)
lastSeq = kMaxSeq lastSeq = keyMaxSeq
} }
switch { switch {
case lastSeq <= b.minSeq: case lastSeq <= b.minSeq:
// Dropped because newer entry for same user key exist // Dropped because newer entry for same user key exist
fallthrough // (A) fallthrough // (A)
case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): case kt == keyTypeDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey):
// For this user key: // For this user key:
// (1) there is no data in higher levels // (1) there is no data in higher levels
// (2) data in lower levels will have larger seq numbers // (2) data in lower levels will have larger seq numbers
@ -479,7 +507,7 @@ func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
// Don't drop corrupted keys. // Don't drop corrupted keys.
hasLastUkey = false hasLastUkey = false
lastUkey = lastUkey[:0] lastUkey = lastUkey[:0]
lastSeq = kMaxSeq lastSeq = keyMaxSeq
b.kerrCnt++ b.kerrCnt++
} }
@ -502,8 +530,7 @@ func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
func (b *tableCompactionBuilder) revert() error { func (b *tableCompactionBuilder) revert() error {
for _, at := range b.rec.addedTables { for _, at := range b.rec.addedTables {
b.s.logf("table@build revert @%d", at.num) b.s.logf("table@build revert @%d", at.num)
f := b.s.getTableFile(at.num) if err := b.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: at.num}); err != nil {
if err := f.Remove(); err != nil {
return err return err
} }
} }
@ -514,30 +541,28 @@ func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
defer c.release() defer c.release()
rec := &sessionRecord{} rec := &sessionRecord{}
rec.addCompPtr(c.level, c.imax) rec.addCompPtr(c.sourceLevel, c.imax)
if !noTrivial && c.trivial() { if !noTrivial && c.trivial() {
t := c.tables[0][0] t := c.levels[0][0]
db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1) db.logf("table@move L%d@%d -> L%d", c.sourceLevel, t.fd.Num, c.sourceLevel+1)
rec.delTable(c.level, t.file.Num()) rec.delTable(c.sourceLevel, t.fd.Num)
rec.addTableFile(c.level+1, t) rec.addTableFile(c.sourceLevel+1, t)
db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) { db.compactionCommit("table-move", rec)
return db.s.commit(rec)
}, nil)
return return
} }
var stats [2]cStatsStaging var stats [2]cStatStaging
for i, tables := range c.tables { for i, tables := range c.levels {
for _, t := range tables { for _, t := range tables {
stats[i].read += t.size stats[i].read += t.size
// Insert deleted tables into record // Insert deleted tables into record
rec.delTable(c.level+i, t.file.Num()) rec.delTable(c.sourceLevel+i, t.fd.Num)
} }
} }
sourceSize := int(stats[0].read + stats[1].read) sourceSize := int(stats[0].read + stats[1].read)
minSeq := db.minSeq() minSeq := db.minSeq()
db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq) db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.sourceLevel, len(c.levels[0]), c.sourceLevel+1, len(c.levels[1]), shortenb(sourceSize), minSeq)
b := &tableCompactionBuilder{ b := &tableCompactionBuilder{
db: db, db: db,
@ -547,49 +572,60 @@ func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
stat1: &stats[1], stat1: &stats[1],
minSeq: minSeq, minSeq: minSeq,
strict: db.s.o.GetStrict(opt.StrictCompaction), strict: db.s.o.GetStrict(opt.StrictCompaction),
tableSize: db.s.o.GetCompactionTableSize(c.level + 1), tableSize: db.s.o.GetCompactionTableSize(c.sourceLevel + 1),
} }
db.compactionTransact("table@build", b) db.compactionTransact("table@build", b)
// Commit changes // Commit.
db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) {
stats[1].startTimer() stats[1].startTimer()
defer stats[1].stopTimer() db.compactionCommit("table", rec)
return db.s.commit(rec) stats[1].stopTimer()
}, nil)
resultSize := int(stats[1].write) resultSize := int(stats[1].write)
db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration)
// Save compaction stats // Save compaction stats
for i := range stats { for i := range stats {
db.compStats[c.level+1].add(&stats[i]) db.compStats.addStat(c.sourceLevel+1, &stats[i])
} }
} }
func (db *DB) tableRangeCompaction(level int, umin, umax []byte) { func (db *DB) tableRangeCompaction(level int, umin, umax []byte) error {
db.logf("table@compaction range L%d %q:%q", level, umin, umax) db.logf("table@compaction range L%d %q:%q", level, umin, umax)
if level >= 0 { if level >= 0 {
if c := db.s.getCompactionRange(level, umin, umax); c != nil { if c := db.s.getCompactionRange(level, umin, umax, true); c != nil {
db.tableCompaction(c, true) db.tableCompaction(c, true)
} }
} else { } else {
// Retry until nothing to compact.
for {
compacted := false
// Scan for maximum level with overlapped tables.
v := db.s.version() v := db.s.version()
m := 1 m := 1
for i, t := range v.tables[1:] { for i := m; i < len(v.levels); i++ {
if t.overlaps(db.s.icmp, umin, umax, false) { tables := v.levels[i]
m = i + 1 if tables.overlaps(db.s.icmp, umin, umax, false) {
m = i
} }
} }
v.release() v.release()
for level := 0; level < m; level++ { for level := 0; level < m; level++ {
if c := db.s.getCompactionRange(level, umin, umax); c != nil { if c := db.s.getCompactionRange(level, umin, umax, false); c != nil {
db.tableCompaction(c, true) db.tableCompaction(c, true)
compacted = true
}
}
if !compacted {
break
} }
} }
} }
return nil
} }
func (db *DB) tableAutoCompaction() { func (db *DB) tableAutoCompaction() {
@ -616,11 +652,11 @@ type cCmd interface {
ack(err error) ack(err error)
} }
type cIdle struct { type cAuto struct {
ackC chan<- error ackC chan<- error
} }
func (r cIdle) ack(err error) { func (r cAuto) ack(err error) {
if r.ackC != nil { if r.ackC != nil {
defer func() { defer func() {
recover() recover()
@ -644,13 +680,21 @@ func (r cRange) ack(err error) {
} }
} }
// This will trigger auto compation and/or wait for all compaction to be done. // This will trigger auto compaction but will not wait for it.
func (db *DB) compSendIdle(compC chan<- cCmd) (err error) { func (db *DB) compTrigger(compC chan<- cCmd) {
select {
case compC <- cAuto{}:
default:
}
}
// This will trigger auto compaction and/or wait for all compaction to be done.
func (db *DB) compTriggerWait(compC chan<- cCmd) (err error) {
ch := make(chan error) ch := make(chan error)
defer close(ch) defer close(ch)
// Send cmd. // Send cmd.
select { select {
case compC <- cIdle{ch}: case compC <- cAuto{ch}:
case err = <-db.compErrC: case err = <-db.compErrC:
return return
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
@ -666,16 +710,8 @@ func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
return err return err
} }
// This will trigger auto compaction but will not wait for it.
func (db *DB) compSendTrigger(compC chan<- cCmd) {
select {
case compC <- cIdle{}:
default:
}
}
// Send range compaction request. // Send range compaction request.
func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) { func (db *DB) compTriggerRange(compC chan<- cCmd, level int, min, max []byte) (err error) {
ch := make(chan error) ch := make(chan error)
defer close(ch) defer close(ch)
// Send cmd. // Send cmd.
@ -715,7 +751,7 @@ func (db *DB) mCompaction() {
select { select {
case x = <-db.mcompCmdC: case x = <-db.mcompCmdC:
switch x.(type) { switch x.(type) {
case cIdle: case cAuto:
db.memCompaction() db.memCompaction()
x.ack(nil) x.ack(nil)
x = nil x = nil
@ -776,11 +812,10 @@ func (db *DB) tCompaction() {
} }
if x != nil { if x != nil {
switch cmd := x.(type) { switch cmd := x.(type) {
case cIdle: case cAuto:
ackQ = append(ackQ, x) ackQ = append(ackQ, x)
case cRange: case cRange:
db.tableRangeCompaction(cmd.level, cmd.min, cmd.max) x.ack(db.tableRangeCompaction(cmd.level, cmd.min, cmd.max))
x.ack(nil)
default: default:
panic("leveldb: unknown command") panic("leveldb: unknown command")
} }

View file

@ -19,7 +19,7 @@ import (
) )
var ( var (
errInvalidIkey = errors.New("leveldb: Iterator: invalid internal key") errInvalidInternalKey = errors.New("leveldb: Iterator: invalid internal key")
) )
type memdbReleaser struct { type memdbReleaser struct {
@ -33,40 +33,50 @@ func (mr *memdbReleaser) Release() {
}) })
} }
func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { func (db *DB) newRawIterator(auxm *memDB, auxt tFiles, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader)
em, fm := db.getMems() em, fm := db.getMems()
v := db.s.version() v := db.s.version()
ti := v.getIterators(slice, ro) tableIts := v.getIterators(slice, ro)
n := len(ti) + 2 n := len(tableIts) + len(auxt) + 3
i := make([]iterator.Iterator, 0, n) its := make([]iterator.Iterator, 0, n)
if auxm != nil {
ami := auxm.NewIterator(slice)
ami.SetReleaser(&memdbReleaser{m: auxm})
its = append(its, ami)
}
for _, t := range auxt {
its = append(its, v.s.tops.newIterator(t, slice, ro))
}
emi := em.NewIterator(slice) emi := em.NewIterator(slice)
emi.SetReleaser(&memdbReleaser{m: em}) emi.SetReleaser(&memdbReleaser{m: em})
i = append(i, emi) its = append(its, emi)
if fm != nil { if fm != nil {
fmi := fm.NewIterator(slice) fmi := fm.NewIterator(slice)
fmi.SetReleaser(&memdbReleaser{m: fm}) fmi.SetReleaser(&memdbReleaser{m: fm})
i = append(i, fmi) its = append(its, fmi)
} }
i = append(i, ti...) its = append(its, tableIts...)
strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader) mi := iterator.NewMergedIterator(its, db.s.icmp, strict)
mi := iterator.NewMergedIterator(i, db.s.icmp, strict)
mi.SetReleaser(&versionReleaser{v: v}) mi.SetReleaser(&versionReleaser{v: v})
return mi return mi
} }
func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *dbIter { func (db *DB) newIterator(auxm *memDB, auxt tFiles, seq uint64, slice *util.Range, ro *opt.ReadOptions) *dbIter {
var islice *util.Range var islice *util.Range
if slice != nil { if slice != nil {
islice = &util.Range{} islice = &util.Range{}
if slice.Start != nil { if slice.Start != nil {
islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek) islice.Start = makeInternalKey(nil, slice.Start, keyMaxSeq, keyTypeSeek)
} }
if slice.Limit != nil { if slice.Limit != nil {
islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek) islice.Limit = makeInternalKey(nil, slice.Limit, keyMaxSeq, keyTypeSeek)
} }
} }
rawIter := db.newRawIterator(islice, ro) rawIter := db.newRawIterator(auxm, auxt, islice, ro)
iter := &dbIter{ iter := &dbIter{
db: db, db: db,
icmp: db.s.icmp, icmp: db.s.icmp,
@ -177,7 +187,7 @@ func (i *dbIter) Seek(key []byte) bool {
return false return false
} }
ikey := newIkey(key, i.seq, ktSeek) ikey := makeInternalKey(nil, key, i.seq, keyTypeSeek)
if i.iter.Seek(ikey) { if i.iter.Seek(ikey) {
i.dir = dirSOI i.dir = dirSOI
return i.next() return i.next()
@ -189,15 +199,15 @@ func (i *dbIter) Seek(key []byte) bool {
func (i *dbIter) next() bool { func (i *dbIter) next() bool {
for { for {
if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil {
i.sampleSeek() i.sampleSeek()
if seq <= i.seq { if seq <= i.seq {
switch kt { switch kt {
case ktDel: case keyTypeDel:
// Skip deleted key. // Skip deleted key.
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.dir = dirForward i.dir = dirForward
case ktVal: case keyTypeVal:
if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 {
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...) i.value = append(i.value[:0], i.iter.Value()...)
@ -240,13 +250,13 @@ func (i *dbIter) prev() bool {
del := true del := true
if i.iter.Valid() { if i.iter.Valid() {
for { for {
if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil {
i.sampleSeek() i.sampleSeek()
if seq <= i.seq { if seq <= i.seq {
if !del && i.icmp.uCompare(ukey, i.key) < 0 { if !del && i.icmp.uCompare(ukey, i.key) < 0 {
return true return true
} }
del = (kt == ktDel) del = (kt == keyTypeDel)
if !del { if !del {
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...) i.value = append(i.value[:0], i.iter.Value()...)
@ -282,7 +292,7 @@ func (i *dbIter) Prev() bool {
return i.Last() return i.Last()
case dirForward: case dirForward:
for i.iter.Prev() { for i.iter.Prev() {
if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil { if ukey, _, _, kerr := parseInternalKey(i.iter.Key()); kerr == nil {
i.sampleSeek() i.sampleSeek()
if i.icmp.uCompare(ukey, i.key) < 0 { if i.icmp.uCompare(ukey, i.key) < 0 {
goto cont goto cont

View file

@ -110,7 +110,7 @@ func (snap *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err er
err = ErrSnapshotReleased err = ErrSnapshotReleased
return return
} }
return snap.db.get(key, snap.elem.seq, ro) return snap.db.get(nil, nil, key, snap.elem.seq, ro)
} }
// Has returns true if the DB does contains the given key. // Has returns true if the DB does contains the given key.
@ -127,10 +127,10 @@ func (snap *Snapshot) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error)
err = ErrSnapshotReleased err = ErrSnapshotReleased
return return
} }
return snap.db.has(key, snap.elem.seq, ro) return snap.db.has(nil, nil, key, snap.elem.seq, ro)
} }
// NewIterator returns an iterator for the snapshot of the uderlying DB. // NewIterator returns an iterator for the snapshot of the underlying DB.
// The returned iterator is not goroutine-safe, but it is safe to use // The returned iterator is not goroutine-safe, but it is safe to use
// multiple iterators concurrently, with each in a dedicated goroutine. // multiple iterators concurrently, with each in a dedicated goroutine.
// It is also safe to use an iterator concurrently with modifying its // It is also safe to use an iterator concurrently with modifying its
@ -158,7 +158,7 @@ func (snap *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterat
} }
// Since iterator already hold version ref, it doesn't need to // Since iterator already hold version ref, it doesn't need to
// hold snapshot ref. // hold snapshot ref.
return snap.db.newIterator(snap.elem.seq, slice, ro) return snap.db.newIterator(nil, nil, snap.elem.seq, slice, ro)
} }
// Release releases the snapshot. This will not release any returned // Release releases the snapshot. This will not release any returned

View file

@ -12,6 +12,7 @@ import (
"github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/memdb" "github.com/syndtr/goleveldb/leveldb/memdb"
"github.com/syndtr/goleveldb/leveldb/storage"
) )
type memDB struct { type memDB struct {
@ -20,6 +21,10 @@ type memDB struct {
ref int32 ref int32
} }
func (m *memDB) getref() int32 {
return atomic.LoadInt32(&m.ref)
}
func (m *memDB) incref() { func (m *memDB) incref() {
atomic.AddInt32(&m.ref, 1) atomic.AddInt32(&m.ref, 1)
} }
@ -48,11 +53,15 @@ func (db *DB) addSeq(delta uint64) {
atomic.AddUint64(&db.seq, delta) atomic.AddUint64(&db.seq, delta)
} }
func (db *DB) sampleSeek(ikey iKey) { func (db *DB) setSeq(seq uint64) {
atomic.StoreUint64(&db.seq, seq)
}
func (db *DB) sampleSeek(ikey internalKey) {
v := db.s.version() v := db.s.version()
if v.sampleSeek(ikey) { if v.sampleSeek(ikey) {
// Trigger table compaction. // Trigger table compaction.
db.compSendTrigger(db.tcompCmdC) db.compTrigger(db.tcompCmdC)
} }
v.release() v.release()
} }
@ -67,12 +76,18 @@ func (db *DB) mpoolPut(mem *memdb.DB) {
} }
} }
func (db *DB) mpoolGet() *memdb.DB { func (db *DB) mpoolGet(n int) *memDB {
var mdb *memdb.DB
select { select {
case mem := <-db.memPool: case mdb = <-db.memPool:
return mem
default: default:
return nil }
if mdb == nil || mdb.Capacity() < n {
mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n))
}
return &memDB{
db: db,
DB: mdb,
} }
} }
@ -95,11 +110,10 @@ func (db *DB) mpoolDrain() {
// Create new memdb and froze the old one; need external synchronization. // Create new memdb and froze the old one; need external synchronization.
// newMem only called synchronously by the writer. // newMem only called synchronously by the writer.
func (db *DB) newMem(n int) (mem *memDB, err error) { func (db *DB) newMem(n int) (mem *memDB, err error) {
num := db.s.allocFileNum() fd := storage.FileDesc{Type: storage.TypeJournal, Num: db.s.allocFileNum()}
file := db.s.getJournalFile(num) w, err := db.s.stor.Create(fd)
w, err := file.Create()
if err != nil { if err != nil {
db.s.reuseFileNum(num) db.s.reuseFileNum(fd.Num)
return return
} }
@ -115,20 +129,14 @@ func (db *DB) newMem(n int) (mem *memDB, err error) {
} else { } else {
db.journal.Reset(w) db.journal.Reset(w)
db.journalWriter.Close() db.journalWriter.Close()
db.frozenJournalFile = db.journalFile db.frozenJournalFd = db.journalFd
} }
db.journalWriter = w db.journalWriter = w
db.journalFile = file db.journalFd = fd
db.frozenMem = db.mem db.frozenMem = db.mem
mdb := db.mpoolGet() mem = db.mpoolGet(n)
if mdb == nil || mdb.Capacity() < n { mem.incref() // for self
mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n)) mem.incref() // for caller
}
mem = &memDB{
db: db,
DB: mdb,
ref: 2,
}
db.mem = mem db.mem = mem
// The seq only incremented by the writer. And whoever called newMem // The seq only incremented by the writer. And whoever called newMem
// should hold write lock, so no need additional synchronization here. // should hold write lock, so no need additional synchronization here.
@ -181,12 +189,12 @@ func (db *DB) getFrozenMem() *memDB {
// Drop frozen memdb; assume that frozen memdb isn't nil. // Drop frozen memdb; assume that frozen memdb isn't nil.
func (db *DB) dropFrozenMem() { func (db *DB) dropFrozenMem() {
db.memMu.Lock() db.memMu.Lock()
if err := db.frozenJournalFile.Remove(); err != nil { if err := db.s.stor.Remove(db.frozenJournalFd); err != nil {
db.logf("journal@remove removing @%d %q", db.frozenJournalFile.Num(), err) db.logf("journal@remove removing @%d %q", db.frozenJournalFd.Num, err)
} else { } else {
db.logf("journal@remove removed @%d", db.frozenJournalFile.Num()) db.logf("journal@remove removed @%d", db.frozenJournalFd.Num)
} }
db.frozenJournalFile = nil db.frozenJournalFd = storage.FileDesc{}
db.frozenMem.decref() db.frozenMem.decref()
db.frozenMem = nil db.frozenMem = nil
db.memMu.Unlock() db.memMu.Unlock()

View file

@ -0,0 +1,289 @@
// Copyright (c) 2016, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package leveldb
import (
"errors"
"sync"
"time"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/util"
)
var errTransactionDone = errors.New("leveldb: transaction already closed")
// Transaction is the transaction handle.
type Transaction struct {
db *DB
lk sync.RWMutex
seq uint64
mem *memDB
tables tFiles
ikScratch []byte
rec sessionRecord
stats cStatStaging
closed bool
}
// Get gets the value for the given key. It returns ErrNotFound if the
// DB does not contains the key.
//
// The returned slice is its own copy, it is safe to modify the contents
// of the returned slice.
// It is safe to modify the contents of the argument after Get returns.
func (tr *Transaction) Get(key []byte, ro *opt.ReadOptions) ([]byte, error) {
tr.lk.RLock()
defer tr.lk.RUnlock()
if tr.closed {
return nil, errTransactionDone
}
return tr.db.get(tr.mem.DB, tr.tables, key, tr.seq, ro)
}
// Has returns true if the DB does contains the given key.
//
// It is safe to modify the contents of the argument after Has returns.
func (tr *Transaction) Has(key []byte, ro *opt.ReadOptions) (bool, error) {
tr.lk.RLock()
defer tr.lk.RUnlock()
if tr.closed {
return false, errTransactionDone
}
return tr.db.has(tr.mem.DB, tr.tables, key, tr.seq, ro)
}
// NewIterator returns an iterator over the latest snapshot of the
// transaction. The iterator itself is not goroutine-safe, but multiple
// iterators may be used concurrently from dedicated goroutines, and an
// iterator may be used while writes are made to the transaction; the
// key/value pairs it yields remain consistent.
//
// Slice restricts the iterator to keys within the given range; a nil
// Range.Start sorts before every key in the DB and a nil Range.Limit
// sorts after every key.
//
// The iterator must be released after use by calling its Release method.
//
// Also read the Iterator documentation of the leveldb/iterator package.
func (tr *Transaction) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
	tr.lk.RLock()
	defer tr.lk.RUnlock()
	if !tr.closed {
		// The iterator holds its own reference on the memdb so that
		// flush() cannot recycle it while the iterator is live.
		tr.mem.incref()
		return tr.db.newIterator(tr.mem, tr.tables, tr.seq, slice, ro)
	}
	return iterator.NewEmptyIterator(errTransactionDone)
}
// flush compacts the transaction's memdb into a new level-0 table when it
// is non-empty. The table is appended to tr.tables, registered in the
// session record (applied later by Commit), and counted in the write
// stats, after which the memdb is emptied so further puts start fresh.
func (tr *Transaction) flush() error {
	// Flush memdb.
	if tr.mem.Len() != 0 {
		tr.stats.startTimer()
		iter := tr.mem.NewIterator(nil)
		t, n, err := tr.db.s.tops.createFrom(iter)
		iter.Release()
		tr.stats.stopTimer()
		if err != nil {
			return err
		}
		// If this transaction holds the only reference, the memdb can be
		// reused in place; otherwise (a live iterator incref'd it in
		// NewIterator) drop our reference and take a fresh memdb from
		// the pool.
		if tr.mem.getref() == 1 {
			tr.mem.Reset()
		} else {
			tr.mem.decref()
			tr.mem = tr.db.mpoolGet(0)
			tr.mem.incref()
		}
		tr.tables = append(tr.tables, t)
		tr.rec.addTableFile(0, t)
		tr.stats.write += t.size
		tr.db.logf("transaction@flush created L0@%d N·%d S·%s %q:%q", t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax)
	}
	return nil
}
// put writes one internal record of type kt. When the memdb lacks room
// for the encoded key plus value it is first flushed to a level-0 table.
// The transaction sequence number advances only on a successful write.
func (tr *Transaction) put(kt keyType, key, value []byte) error {
	ik := makeInternalKey(tr.ikScratch, key, tr.seq+1, kt)
	tr.ikScratch = ik // retain the (possibly grown) scratch buffer for reuse
	if tr.mem.Free() < len(ik)+len(value) {
		if ferr := tr.flush(); ferr != nil {
			return ferr
		}
	}
	if perr := tr.mem.Put(ik, value); perr != nil {
		return perr
	}
	tr.seq++
	return nil
}
// Put sets the value for the given key, overwriting any previous value
// for that key; a DB is not a multi-map. Note that a transaction is not
// compacted until committed, so writing the same key ten times keeps
// all ten records in the transaction.
//
// It is safe to modify the contents of the arguments after Put returns.
func (tr *Transaction) Put(key, value []byte, wo *opt.WriteOptions) error {
	tr.lk.Lock()
	defer tr.lk.Unlock()
	if !tr.closed {
		return tr.put(keyTypeVal, key, value)
	}
	return errTransactionDone
}
// Delete removes the value for the given key. Note that a transaction is
// not compacted until committed, so writing the same key ten times keeps
// all ten records in the transaction.
//
// It is safe to modify the contents of the arguments after Delete returns.
func (tr *Transaction) Delete(key []byte, wo *opt.WriteOptions) error {
	tr.lk.Lock()
	defer tr.lk.Unlock()
	if !tr.closed {
		return tr.put(keyTypeDel, key, nil)
	}
	return errTransactionDone
}
// Write applies the records of the given batch to the transaction, in
// order. Note that a transaction is not compacted until committed, so
// writing the same key ten times keeps all ten records.
//
// It is safe to modify the contents of the arguments after Write returns.
func (tr *Transaction) Write(b *Batch, wo *opt.WriteOptions) error {
	if b == nil || b.Len() == 0 {
		// Nothing to apply.
		return nil
	}
	tr.lk.Lock()
	defer tr.lk.Unlock()
	if tr.closed {
		return errTransactionDone
	}
	return b.decodeRec(func(_ int, kt keyType, k, v []byte) error {
		return tr.put(kt, k, v)
	})
}
// setDone marks the transaction closed, detaches it from the DB, drops
// the transaction's memdb reference and finally releases the DB write
// lock that OpenTransaction acquired. Called with tr.lk held by both
// Commit and Discard.
func (tr *Transaction) setDone() {
	tr.closed = true
	tr.db.tr = nil
	tr.mem.decref()
	// Release the exclusive writer lock last, so no writer can proceed
	// before the transaction state is fully torn down.
	<-tr.db.writeLockC
}
// Commit commits the transaction. If a non-nil error is returned the
// session commit did not succeed.
//
// Other methods should not be called after the transaction has been
// committed.
func (tr *Transaction) Commit() error {
	if err := tr.db.ok(); err != nil {
		return err
	}
	tr.lk.Lock()
	defer tr.lk.Unlock()
	if tr.closed {
		return errTransactionDone
	}
	// The transaction is closed whether or not the commit succeeds.
	defer tr.setDone()
	if err := tr.flush(); err != nil {
		tr.discard()
		return err
	}
	if len(tr.tables) != 0 {
		// Committing transaction.
		tr.rec.setSeqNum(tr.seq)
		tr.db.compCommitLk.Lock()
		defer tr.db.compCommitLk.Unlock()
		var cerr error
		for retry := 0; retry < 3; retry++ {
			cerr = tr.db.s.commit(&tr.rec)
			if cerr == nil {
				// Success. Set db.seq.
				tr.db.setSeq(tr.seq)
				break
			}
			tr.db.logf("transaction@commit error R·%d %q", retry, cerr)
			select {
			case <-time.After(time.Second):
				// Back off before retrying.
			case <-tr.db.closeC:
				tr.db.logf("transaction@commit exiting")
				return cerr
			}
		}
		if cerr != nil {
			// All retries exhausted; previously this fell through and
			// returned nil, silently reporting success for a failed
			// manifest commit. Report the failure instead.
			return cerr
		}
		// Trigger table auto-compaction.
		tr.db.compTrigger(tr.db.tcompCmdC)
	}
	return nil
}
// discard deletes the transaction's flushed tables from storage; the
// file number of each successfully removed table is recycled.
func (tr *Transaction) discard() {
	// Discard transaction.
	for _, tbl := range tr.tables {
		tr.db.logf("transaction@discard @%d", tbl.fd.Num)
		if rerr := tr.db.s.stor.Remove(tbl.fd); rerr == nil {
			tr.db.s.reuseFileNum(tbl.fd.Num)
		}
	}
}
// Discard discards the transaction, removing any tables it has created.
//
// Other methods should not be called after the transaction has been
// discarded.
func (tr *Transaction) Discard() {
	tr.lk.Lock()
	defer tr.lk.Unlock()
	if tr.closed {
		return
	}
	tr.discard()
	tr.setDone()
}
// OpenTransaction opens an atomic DB transaction. Only one transaction
// can be open at a time; other writers block until it is committed or
// discarded.
// The returned transaction handle is goroutine-safe.
//
// The transaction must be closed once done, either by committing or
// discarding it. Closing the DB will discard an open transaction.
func (db *DB) OpenTransaction() (*Transaction, error) {
	if err := db.ok(); err != nil {
		return nil, err
	}
	// Acquire the exclusive writer lock; this is what blocks other
	// writers for the lifetime of the transaction.
	select {
	case db.writeLockC <- struct{}{}:
	case err := <-db.compPerErrC:
		return nil, err
	case <-db.closeC:
		return nil, ErrClosed
	}
	if db.tr != nil {
		panic("leveldb: has open transaction")
	}
	// Flush current memdb so the transaction starts from a clean buffer.
	if db.mem != nil && db.mem.Len() != 0 {
		if _, err := db.rotateMem(0, true); err != nil {
			// Release the writer lock; no Transaction exists yet to do
			// it via setDone, and leaving it held would deadlock every
			// subsequent writer.
			<-db.writeLockC
			return nil, err
		}
	}
	tr := &Transaction{
		db:  db,
		seq: db.seq,
		mem: db.mpoolGet(0),
	}
	tr.mem.incref()
	db.tr = tr
	return tr, nil
}

View file

@ -21,14 +21,16 @@ type Reader interface {
NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator
} }
type Sizes []uint64 // Sizes is list of size.
type Sizes []int64
// Sum returns sum of the sizes. // Sum returns sum of the sizes.
func (p Sizes) Sum() (n uint64) { func (sizes Sizes) Sum() int64 {
for _, s := range p { var sum int64
n += s for _, size := range sizes {
sum += size
} }
return n return sum
} }
// Logging. // Logging.
@ -40,59 +42,59 @@ func (db *DB) checkAndCleanFiles() error {
v := db.s.version() v := db.s.version()
defer v.release() defer v.release()
tablesMap := make(map[uint64]bool) tmap := make(map[int64]bool)
for _, tables := range v.tables { for _, tables := range v.levels {
for _, t := range tables { for _, t := range tables {
tablesMap[t.file.Num()] = false tmap[t.fd.Num] = false
} }
} }
files, err := db.s.getFiles(storage.TypeAll) fds, err := db.s.stor.List(storage.TypeAll)
if err != nil { if err != nil {
return err return err
} }
var nTables int var nt int
var rem []storage.File var rem []storage.FileDesc
for _, f := range files { for _, fd := range fds {
keep := true keep := true
switch f.Type() { switch fd.Type {
case storage.TypeManifest: case storage.TypeManifest:
keep = f.Num() >= db.s.manifestFile.Num() keep = fd.Num >= db.s.manifestFd.Num
case storage.TypeJournal: case storage.TypeJournal:
if db.frozenJournalFile != nil { if !db.frozenJournalFd.Nil() {
keep = f.Num() >= db.frozenJournalFile.Num() keep = fd.Num >= db.frozenJournalFd.Num
} else { } else {
keep = f.Num() >= db.journalFile.Num() keep = fd.Num >= db.journalFd.Num
} }
case storage.TypeTable: case storage.TypeTable:
_, keep = tablesMap[f.Num()] _, keep = tmap[fd.Num]
if keep { if keep {
tablesMap[f.Num()] = true tmap[fd.Num] = true
nTables++ nt++
} }
} }
if !keep { if !keep {
rem = append(rem, f) rem = append(rem, fd)
} }
} }
if nTables != len(tablesMap) { if nt != len(tmap) {
var missing []*storage.FileInfo var mfds []storage.FileDesc
for num, present := range tablesMap { for num, present := range tmap {
if !present { if !present {
missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num}) mfds = append(mfds, storage.FileDesc{storage.TypeTable, num})
db.logf("db@janitor table missing @%d", num) db.logf("db@janitor table missing @%d", num)
} }
} }
return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing}) return errors.NewErrCorrupted(storage.FileDesc{}, &errors.ErrMissingFiles{Fds: mfds})
} }
db.logf("db@janitor F·%d G·%d", len(files), len(rem)) db.logf("db@janitor F·%d G·%d", len(fds), len(rem))
for _, f := range rem { for _, fd := range rem {
db.logf("db@janitor removing %s-%d", f.Type(), f.Num()) db.logf("db@janitor removing %s-%d", fd.Type, fd.Num)
if err := f.Remove(); err != nil { if err := db.s.stor.Remove(fd); err != nil {
return err return err
} }
} }

View file

@ -45,9 +45,9 @@ func (db *DB) jWriter() {
} }
} }
func (db *DB) rotateMem(n int) (mem *memDB, err error) { func (db *DB) rotateMem(n int, wait bool) (mem *memDB, err error) {
// Wait for pending memdb compaction. // Wait for pending memdb compaction.
err = db.compSendIdle(db.mcompCmdC) err = db.compTriggerWait(db.mcompCmdC)
if err != nil { if err != nil {
return return
} }
@ -59,7 +59,11 @@ func (db *DB) rotateMem(n int) (mem *memDB, err error) {
} }
// Schedule memdb compaction. // Schedule memdb compaction.
db.compSendTrigger(db.mcompCmdC) if wait {
err = db.compTriggerWait(db.mcompCmdC)
} else {
db.compTrigger(db.mcompCmdC)
}
return return
} }
@ -84,7 +88,7 @@ func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) {
return false return false
case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger(): case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger():
delayed = true delayed = true
err = db.compSendIdle(db.tcompCmdC) err = db.compTriggerWait(db.tcompCmdC)
if err != nil { if err != nil {
return false return false
} }
@ -94,7 +98,7 @@ func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) {
mdbFree = n mdbFree = n
} else { } else {
mdb.decref() mdb.decref()
mdb, err = db.rotateMem(n) mdb, err = db.rotateMem(n, false)
if err == nil { if err == nil {
mdbFree = mdb.Free() mdbFree = mdb.Free()
} else { } else {
@ -131,12 +135,27 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
b.init(wo.GetSync() && !db.s.o.GetNoSync()) b.init(wo.GetSync() && !db.s.o.GetNoSync())
if b.size() > db.s.o.GetWriteBuffer() && !db.s.o.GetDisableLargeBatchTransaction() {
// Writes using transaction.
tr, err1 := db.OpenTransaction()
if err1 != nil {
return err1
}
if err1 := tr.Write(b, wo); err1 != nil {
tr.Discard()
return err1
}
return tr.Commit()
}
// The write happen synchronously. // The write happen synchronously.
select { select {
case db.writeC <- b: case db.writeC <- b:
if <-db.writeMergedC { if <-db.writeMergedC {
return <-db.writeAckC return <-db.writeAckC
} }
// Continue, the write lock already acquired by previous writer
// and handed out to us.
case db.writeLockC <- struct{}{}: case db.writeLockC <- struct{}{}:
case err = <-db.compPerErrC: case err = <-db.compPerErrC:
return return
@ -147,14 +166,15 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
merged := 0 merged := 0
danglingMerge := false danglingMerge := false
defer func() { defer func() {
for i := 0; i < merged; i++ {
db.writeAckC <- err
}
if danglingMerge { if danglingMerge {
// Only one dangling merge at most, so this is safe.
db.writeMergedC <- false db.writeMergedC <- false
} else { } else {
<-db.writeLockC <-db.writeLockC
} }
for i := 0; i < merged; i++ {
db.writeAckC <- err
}
}() }()
mdb, mdbFree, err := db.flush(b.size()) mdb, mdbFree, err := db.flush(b.size())
@ -234,7 +254,7 @@ drain:
db.addSeq(uint64(b.Len())) db.addSeq(uint64(b.Len()))
if b.size() >= mdbFree { if b.size() >= mdbFree {
db.rotateMem(0) db.rotateMem(0, false)
} }
return return
} }
@ -261,8 +281,8 @@ func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error {
func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool {
iter := mem.NewIterator(nil) iter := mem.NewIterator(nil)
defer iter.Release() defer iter.Release()
return (max == nil || (iter.First() && icmp.uCompare(max, iKey(iter.Key()).ukey()) >= 0)) && return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) &&
(min == nil || (iter.Last() && icmp.uCompare(min, iKey(iter.Key()).ukey()) <= 0)) (min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0))
} }
// CompactRange compacts the underlying DB for the given key range. // CompactRange compacts the underlying DB for the given key range.
@ -293,12 +313,12 @@ func (db *DB) CompactRange(r util.Range) error {
defer mdb.decref() defer mdb.decref()
if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) { if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) {
// Memdb compaction. // Memdb compaction.
if _, err := db.rotateMem(0); err != nil { if _, err := db.rotateMem(0, false); err != nil {
<-db.writeLockC <-db.writeLockC
return err return err
} }
<-db.writeLockC <-db.writeLockC
if err := db.compSendIdle(db.mcompCmdC); err != nil { if err := db.compTriggerWait(db.mcompCmdC); err != nil {
return err return err
} }
} else { } else {
@ -306,7 +326,7 @@ func (db *DB) CompactRange(r util.Range) error {
} }
// Table compaction. // Table compaction.
return db.compSendRange(db.tcompCmdC, -1, r.Start, r.Limit) return db.compTriggerRange(db.tcompCmdC, -1, r.Start, r.Limit)
} }
// SetReadOnly makes DB read-only. It will stay read-only until reopened. // SetReadOnly makes DB read-only. It will stay read-only until reopened.

View file

@ -29,21 +29,21 @@ func New(text string) error {
// ErrCorrupted is the type that wraps errors that indicate corruption in // ErrCorrupted is the type that wraps errors that indicate corruption in
// the database. // the database.
type ErrCorrupted struct { type ErrCorrupted struct {
File *storage.FileInfo Fd storage.FileDesc
Err error Err error
} }
func (e *ErrCorrupted) Error() string { func (e *ErrCorrupted) Error() string {
if e.File != nil { if !e.Fd.Nil() {
return fmt.Sprintf("%v [file=%v]", e.Err, e.File) return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd)
} else { } else {
return e.Err.Error() return e.Err.Error()
} }
} }
// NewErrCorrupted creates new ErrCorrupted error. // NewErrCorrupted creates new ErrCorrupted error.
func NewErrCorrupted(f storage.File, err error) error { func NewErrCorrupted(fd storage.FileDesc, err error) error {
return &ErrCorrupted{storage.NewFileInfo(f), err} return &ErrCorrupted{fd, err}
} }
// IsCorrupted returns a boolean indicating whether the error is indicating // IsCorrupted returns a boolean indicating whether the error is indicating
@ -61,17 +61,17 @@ func IsCorrupted(err error) bool {
// ErrMissingFiles is the type that indicating a corruption due to missing // ErrMissingFiles is the type that indicating a corruption due to missing
// files. ErrMissingFiles always wrapped with ErrCorrupted. // files. ErrMissingFiles always wrapped with ErrCorrupted.
type ErrMissingFiles struct { type ErrMissingFiles struct {
Files []*storage.FileInfo Fds []storage.FileDesc
} }
func (e *ErrMissingFiles) Error() string { return "file missing" } func (e *ErrMissingFiles) Error() string { return "file missing" }
// SetFile sets 'file info' of the given error with the given file. // SetFd sets 'file info' of the given error with the given file.
// Currently only ErrCorrupted is supported, otherwise will do nothing. // Currently only ErrCorrupted is supported, otherwise will do nothing.
func SetFile(err error, f storage.File) error { func SetFd(err error, fd storage.FileDesc) error {
switch x := err.(type) { switch x := err.(type) {
case *ErrCorrupted: case *ErrCorrupted:
x.File = storage.NewFileInfo(f) x.Fd = fd
return x return x
} }
return err return err

View file

@ -15,7 +15,7 @@ type iFilter struct {
} }
func (f iFilter) Contains(filter, key []byte) bool { func (f iFilter) Contains(filter, key []byte) bool {
return f.Filter.Contains(filter, iKey(key).ukey()) return f.Filter.Contains(filter, internalKey(key).ukey())
} }
func (f iFilter) NewGenerator() filter.FilterGenerator { func (f iFilter) NewGenerator() filter.FilterGenerator {
@ -27,5 +27,5 @@ type iFilterGenerator struct {
} }
func (g iFilterGenerator) Add(key []byte) { func (g iFilterGenerator) Add(key []byte) {
g.FilterGenerator.Add(iKey(key).ukey()) g.FilterGenerator.Add(internalKey(key).ukey())
} }

View file

@ -83,6 +83,7 @@ import (
"io" "io"
"github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -165,7 +166,7 @@ func (r *Reader) corrupt(n int, reason string, skip bool) error {
r.dropper.Drop(&ErrCorrupted{n, reason}) r.dropper.Drop(&ErrCorrupted{n, reason})
} }
if r.strict && !skip { if r.strict && !skip {
r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason}) r.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrCorrupted{n, reason})
return r.err return r.err
} }
return errSkip return errSkip

View file

@ -11,28 +11,30 @@ import (
"fmt" "fmt"
"github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/storage"
) )
type ErrIkeyCorrupted struct { // ErrInternalKeyCorrupted records internal key corruption.
type ErrInternalKeyCorrupted struct {
Ikey []byte Ikey []byte
Reason string Reason string
} }
func (e *ErrIkeyCorrupted) Error() string { func (e *ErrInternalKeyCorrupted) Error() string {
return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason) return fmt.Sprintf("leveldb: internal key %q corrupted: %s", e.Ikey, e.Reason)
} }
func newErrIkeyCorrupted(ikey []byte, reason string) error { func newErrInternalKeyCorrupted(ikey []byte, reason string) error {
return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason}) return errors.NewErrCorrupted(storage.FileDesc{}, &ErrInternalKeyCorrupted{append([]byte{}, ikey...), reason})
} }
type kType int type keyType uint
func (kt kType) String() string { func (kt keyType) String() string {
switch kt { switch kt {
case ktDel: case keyTypeDel:
return "d" return "d"
case ktVal: case keyTypeVal:
return "v" return "v"
} }
return "x" return "x"
@ -41,102 +43,105 @@ func (kt kType) String() string {
// Value types encoded as the last component of internal keys. // Value types encoded as the last component of internal keys.
// Don't modify; this value are saved to disk. // Don't modify; this value are saved to disk.
const ( const (
ktDel kType = iota keyTypeDel keyType = iota
ktVal keyTypeVal
) )
// ktSeek defines the kType that should be passed when constructing an // keyTypeSeek defines the keyType that should be passed when constructing an
// internal key for seeking to a particular sequence number (since we // internal key for seeking to a particular sequence number (since we
// sort sequence numbers in decreasing order and the value type is // sort sequence numbers in decreasing order and the value type is
// embedded as the low 8 bits in the sequence number in internal keys, // embedded as the low 8 bits in the sequence number in internal keys,
// we need to use the highest-numbered ValueType, not the lowest). // we need to use the highest-numbered ValueType, not the lowest).
const ktSeek = ktVal const keyTypeSeek = keyTypeVal
const ( const (
// Maximum value possible for sequence number; the 8-bits are // Maximum value possible for sequence number; the 8-bits are
// used by value type, so its can packed together in single // used by value type, so its can packed together in single
// 64-bit integer. // 64-bit integer.
kMaxSeq uint64 = (uint64(1) << 56) - 1 keyMaxSeq = (uint64(1) << 56) - 1
// Maximum value possible for packed sequence number and type. // Maximum value possible for packed sequence number and type.
kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek) keyMaxNum = (keyMaxSeq << 8) | uint64(keyTypeSeek)
) )
// Maximum number encoded in bytes. // Maximum number encoded in bytes.
var kMaxNumBytes = make([]byte, 8) var keyMaxNumBytes = make([]byte, 8)
func init() { func init() {
binary.LittleEndian.PutUint64(kMaxNumBytes, kMaxNum) binary.LittleEndian.PutUint64(keyMaxNumBytes, keyMaxNum)
} }
type iKey []byte type internalKey []byte
func newIkey(ukey []byte, seq uint64, kt kType) iKey { func makeInternalKey(dst, ukey []byte, seq uint64, kt keyType) internalKey {
if seq > kMaxSeq { if seq > keyMaxSeq {
panic("leveldb: invalid sequence number") panic("leveldb: invalid sequence number")
} else if kt > ktVal { } else if kt > keyTypeVal {
panic("leveldb: invalid type") panic("leveldb: invalid type")
} }
ik := make(iKey, len(ukey)+8) if n := len(ukey) + 8; cap(dst) < n {
copy(ik, ukey) dst = make([]byte, n)
binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt)) } else {
return ik dst = dst[:n]
}
copy(dst, ukey)
binary.LittleEndian.PutUint64(dst[len(ukey):], (seq<<8)|uint64(kt))
return internalKey(dst)
} }
func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) { func parseInternalKey(ik []byte) (ukey []byte, seq uint64, kt keyType, err error) {
if len(ik) < 8 { if len(ik) < 8 {
return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length") return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid length")
} }
num := binary.LittleEndian.Uint64(ik[len(ik)-8:]) num := binary.LittleEndian.Uint64(ik[len(ik)-8:])
seq, kt = uint64(num>>8), kType(num&0xff) seq, kt = uint64(num>>8), keyType(num&0xff)
if kt > ktVal { if kt > keyTypeVal {
return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type") return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid type")
} }
ukey = ik[:len(ik)-8] ukey = ik[:len(ik)-8]
return return
} }
func validIkey(ik []byte) bool { func validInternalKey(ik []byte) bool {
_, _, _, err := parseIkey(ik) _, _, _, err := parseInternalKey(ik)
return err == nil return err == nil
} }
func (ik iKey) assert() { func (ik internalKey) assert() {
if ik == nil { if ik == nil {
panic("leveldb: nil iKey") panic("leveldb: nil internalKey")
} }
if len(ik) < 8 { if len(ik) < 8 {
panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", []byte(ik), len(ik))) panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid length", []byte(ik), len(ik)))
} }
} }
func (ik iKey) ukey() []byte { func (ik internalKey) ukey() []byte {
ik.assert() ik.assert()
return ik[:len(ik)-8] return ik[:len(ik)-8]
} }
func (ik iKey) num() uint64 { func (ik internalKey) num() uint64 {
ik.assert() ik.assert()
return binary.LittleEndian.Uint64(ik[len(ik)-8:]) return binary.LittleEndian.Uint64(ik[len(ik)-8:])
} }
func (ik iKey) parseNum() (seq uint64, kt kType) { func (ik internalKey) parseNum() (seq uint64, kt keyType) {
num := ik.num() num := ik.num()
seq, kt = uint64(num>>8), kType(num&0xff) seq, kt = uint64(num>>8), keyType(num&0xff)
if kt > ktVal { if kt > keyTypeVal {
panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", []byte(ik), len(ik), kt)) panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid type %#x", []byte(ik), len(ik), kt))
} }
return return
} }
func (ik iKey) String() string { func (ik internalKey) String() string {
if ik == nil { if ik == nil {
return "<nil>" return "<nil>"
} }
if ukey, seq, kt, err := parseIkey(ik); err == nil { if ukey, seq, kt, err := parseInternalKey(ik); err == nil {
return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq) return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq)
} else { }
return "<invalid>" return "<invalid>"
} }
}

View file

@ -8,10 +8,11 @@
package opt package opt
import ( import (
"math"
"github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/cache"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/filter"
"math"
) )
const ( const (
@ -35,8 +36,6 @@ var (
DefaultCompactionTotalSizeMultiplier = 10.0 DefaultCompactionTotalSizeMultiplier = 10.0
DefaultCompressionType = SnappyCompression DefaultCompressionType = SnappyCompression
DefaultIteratorSamplingRate = 1 * MiB DefaultIteratorSamplingRate = 1 * MiB
DefaultMaxMemCompationLevel = 2
DefaultNumLevel = 7
DefaultOpenFilesCacher = LRUCacher DefaultOpenFilesCacher = LRUCacher
DefaultOpenFilesCacheCapacity = 500 DefaultOpenFilesCacheCapacity = 500
DefaultWriteBuffer = 4 * MiB DefaultWriteBuffer = 4 * MiB
@ -266,6 +265,13 @@ type Options struct {
// The default value is false. // The default value is false.
DisableCompactionBackoff bool DisableCompactionBackoff bool
// DisableLargeBatchTransaction allows disabling switch-to-transaction mode
// on large batch write. If enable batch writes large than WriteBuffer will
// use transaction.
//
// The default is false.
DisableLargeBatchTransaction bool
// ErrorIfExist defines whether an error should returned if the DB already // ErrorIfExist defines whether an error should returned if the DB already
// exist. // exist.
// //
@ -301,24 +307,11 @@ type Options struct {
// The default is 1MiB. // The default is 1MiB.
IteratorSamplingRate int IteratorSamplingRate int
// MaxMemCompationLevel defines maximum level a newly compacted 'memdb'
// will be pushed into if doesn't creates overlap. This should less than
// NumLevel. Use -1 for level-0.
//
// The default is 2.
MaxMemCompationLevel int
// NoSync allows completely disable fsync. // NoSync allows completely disable fsync.
// //
// The default is false. // The default is false.
NoSync bool NoSync bool
// NumLevel defines number of database level. The level shouldn't changed
// between opens, or the database will panic.
//
// The default is 7.
NumLevel int
// OpenFilesCacher provides cache algorithm for open files caching. // OpenFilesCacher provides cache algorithm for open files caching.
// Specify NoCacher to disable caching algorithm. // Specify NoCacher to disable caching algorithm.
// //
@ -440,7 +433,7 @@ func (o *Options) GetCompactionTableSize(level int) int {
if o.CompactionTableSize > 0 { if o.CompactionTableSize > 0 {
base = o.CompactionTableSize base = o.CompactionTableSize
} }
if len(o.CompactionTableSizeMultiplierPerLevel) > level && o.CompactionTableSizeMultiplierPerLevel[level] > 0 { if level < len(o.CompactionTableSizeMultiplierPerLevel) && o.CompactionTableSizeMultiplierPerLevel[level] > 0 {
mult = o.CompactionTableSizeMultiplierPerLevel[level] mult = o.CompactionTableSizeMultiplierPerLevel[level]
} else if o.CompactionTableSizeMultiplier > 0 { } else if o.CompactionTableSizeMultiplier > 0 {
mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level)) mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level))
@ -461,7 +454,7 @@ func (o *Options) GetCompactionTotalSize(level int) int64 {
if o.CompactionTotalSize > 0 { if o.CompactionTotalSize > 0 {
base = o.CompactionTotalSize base = o.CompactionTotalSize
} }
if len(o.CompactionTotalSizeMultiplierPerLevel) > level && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 { if level < len(o.CompactionTotalSizeMultiplierPerLevel) && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 {
mult = o.CompactionTotalSizeMultiplierPerLevel[level] mult = o.CompactionTotalSizeMultiplierPerLevel[level]
} else if o.CompactionTotalSizeMultiplier > 0 { } else if o.CompactionTotalSizeMultiplier > 0 {
mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level)) mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level))
@ -508,6 +501,13 @@ func (o *Options) GetDisableCompactionBackoff() bool {
return o.DisableCompactionBackoff return o.DisableCompactionBackoff
} }
func (o *Options) GetDisableLargeBatchTransaction() bool {
if o == nil {
return false
}
return o.DisableLargeBatchTransaction
}
func (o *Options) GetErrorIfExist() bool { func (o *Options) GetErrorIfExist() bool {
if o == nil { if o == nil {
return false return false
@ -536,21 +536,6 @@ func (o *Options) GetIteratorSamplingRate() int {
return o.IteratorSamplingRate return o.IteratorSamplingRate
} }
func (o *Options) GetMaxMemCompationLevel() int {
level := DefaultMaxMemCompationLevel
if o != nil {
if o.MaxMemCompationLevel > 0 {
level = o.MaxMemCompationLevel
} else if o.MaxMemCompationLevel < 0 {
level = 0
}
}
if level >= o.GetNumLevel() {
return o.GetNumLevel() - 1
}
return level
}
func (o *Options) GetNoSync() bool { func (o *Options) GetNoSync() bool {
if o == nil { if o == nil {
return false return false
@ -558,13 +543,6 @@ func (o *Options) GetNoSync() bool {
return o.NoSync return o.NoSync
} }
func (o *Options) GetNumLevel() int {
if o == nil || o.NumLevel <= 0 {
return DefaultNumLevel
}
return o.NumLevel
}
func (o *Options) GetOpenFilesCacher() Cacher { func (o *Options) GetOpenFilesCacher() Cacher {
if o == nil || o.OpenFilesCacher == nil { if o == nil || o.OpenFilesCacher == nil {
return DefaultOpenFilesCacher return DefaultOpenFilesCacher

View file

@ -43,6 +43,8 @@ func (s *session) setOptions(o *opt.Options) {
s.o.cache() s.o.cache()
} }
const optCachedLevel = 7
type cachedOptions struct { type cachedOptions struct {
*opt.Options *opt.Options
@ -54,15 +56,13 @@ type cachedOptions struct {
} }
func (co *cachedOptions) cache() { func (co *cachedOptions) cache() {
numLevel := co.Options.GetNumLevel() co.compactionExpandLimit = make([]int, optCachedLevel)
co.compactionGPOverlaps = make([]int, optCachedLevel)
co.compactionSourceLimit = make([]int, optCachedLevel)
co.compactionTableSize = make([]int, optCachedLevel)
co.compactionTotalSize = make([]int64, optCachedLevel)
co.compactionExpandLimit = make([]int, numLevel) for level := 0; level < optCachedLevel; level++ {
co.compactionGPOverlaps = make([]int, numLevel)
co.compactionSourceLimit = make([]int, numLevel)
co.compactionTableSize = make([]int, numLevel)
co.compactionTotalSize = make([]int64, numLevel)
for level := 0; level < numLevel; level++ {
co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level) co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level)
co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level) co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level)
co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level) co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level)
@ -72,21 +72,36 @@ func (co *cachedOptions) cache() {
} }
func (co *cachedOptions) GetCompactionExpandLimit(level int) int { func (co *cachedOptions) GetCompactionExpandLimit(level int) int {
if level < optCachedLevel {
return co.compactionExpandLimit[level] return co.compactionExpandLimit[level]
} }
return co.Options.GetCompactionExpandLimit(level)
}
func (co *cachedOptions) GetCompactionGPOverlaps(level int) int { func (co *cachedOptions) GetCompactionGPOverlaps(level int) int {
if level < optCachedLevel {
return co.compactionGPOverlaps[level] return co.compactionGPOverlaps[level]
} }
return co.Options.GetCompactionGPOverlaps(level)
}
func (co *cachedOptions) GetCompactionSourceLimit(level int) int { func (co *cachedOptions) GetCompactionSourceLimit(level int) int {
if level < optCachedLevel {
return co.compactionSourceLimit[level] return co.compactionSourceLimit[level]
} }
return co.Options.GetCompactionSourceLimit(level)
}
func (co *cachedOptions) GetCompactionTableSize(level int) int { func (co *cachedOptions) GetCompactionTableSize(level int) int {
if level < optCachedLevel {
return co.compactionTableSize[level] return co.compactionTableSize[level]
} }
return co.Options.GetCompactionTableSize(level)
}
func (co *cachedOptions) GetCompactionTotalSize(level int) int64 { func (co *cachedOptions) GetCompactionTotalSize(level int) int64 {
if level < optCachedLevel {
return co.compactionTotalSize[level] return co.compactionTotalSize[level]
} }
return co.Options.GetCompactionTotalSize(level)
}

View file

@ -16,9 +16,9 @@ import (
"github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage" "github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util"
) )
// ErrManifestCorrupted records manifest corruption.
type ErrManifestCorrupted struct { type ErrManifestCorrupted struct {
Field string Field string
Reason string Reason string
@ -28,30 +28,30 @@ func (e *ErrManifestCorrupted) Error() string {
return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason) return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason)
} }
func newErrManifestCorrupted(f storage.File, field, reason string) error { func newErrManifestCorrupted(fd storage.FileDesc, field, reason string) error {
return errors.NewErrCorrupted(f, &ErrManifestCorrupted{field, reason}) return errors.NewErrCorrupted(fd, &ErrManifestCorrupted{field, reason})
} }
// session represent a persistent database session. // session represent a persistent database session.
type session struct { type session struct {
// Need 64-bit alignment. // Need 64-bit alignment.
stNextFileNum uint64 // current unused file number stNextFileNum int64 // current unused file number
stJournalNum uint64 // current journal file number; need external synchronization stJournalNum int64 // current journal file number; need external synchronization
stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb stPrevJournalNum int64 // prev journal file number; no longer used; for compatibility with older version of leveldb
stTempFileNum int64
stSeqNum uint64 // last mem compacted seq; need external synchronization stSeqNum uint64 // last mem compacted seq; need external synchronization
stTempFileNum uint64
stor storage.Storage stor storage.Storage
storLock util.Releaser storLock storage.Lock
o *cachedOptions o *cachedOptions
icmp *iComparer icmp *iComparer
tops *tOps tops *tOps
manifest *journal.Writer manifest *journal.Writer
manifestWriter storage.Writer manifestWriter storage.Writer
manifestFile storage.File manifestFd storage.FileDesc
stCompPtrs []iKey // compaction pointers; need external synchronization stCompPtrs []internalKey // compaction pointers; need external synchronization
stVersion *version // current version stVersion *version // current version
vmu sync.Mutex vmu sync.Mutex
} }
@ -68,7 +68,6 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) {
s = &session{ s = &session{
stor: stor, stor: stor,
storLock: storLock, storLock: storLock,
stCompPtrs: make([]iKey, o.GetNumLevel()),
} }
s.setOptions(o) s.setOptions(o)
s.tops = newTableOps(s) s.tops = newTableOps(s)
@ -88,7 +87,6 @@ func (s *session) close() {
} }
s.manifest = nil s.manifest = nil
s.manifestWriter = nil s.manifestWriter = nil
s.manifestFile = nil
s.stVersion = nil s.stVersion = nil
} }
@ -109,18 +107,18 @@ func (s *session) recover() (err error) {
if os.IsNotExist(err) { if os.IsNotExist(err) {
// Don't return os.ErrNotExist if the underlying storage contains // Don't return os.ErrNotExist if the underlying storage contains
// other files that belong to LevelDB. So the DB won't get trashed. // other files that belong to LevelDB. So the DB won't get trashed.
if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 { if fds, _ := s.stor.List(storage.TypeAll); len(fds) > 0 {
err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}} err = &errors.ErrCorrupted{Fd: storage.FileDesc{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
} }
} }
}() }()
m, err := s.stor.GetManifest() fd, err := s.stor.GetMeta()
if err != nil { if err != nil {
return return
} }
reader, err := m.Open() reader, err := s.stor.Open(fd)
if err != nil { if err != nil {
return return
} }
@ -128,10 +126,9 @@ func (s *session) recover() (err error) {
var ( var (
// Options. // Options.
numLevel = s.o.GetNumLevel()
strict = s.o.GetStrict(opt.StrictManifest) strict = s.o.GetStrict(opt.StrictManifest)
jr = journal.NewReader(reader, dropper{s, m}, strict, true) jr = journal.NewReader(reader, dropper{s, fd}, strict, true)
rec = &sessionRecord{} rec = &sessionRecord{}
staging = s.stVersion.newStaging() staging = s.stVersion.newStaging()
) )
@ -143,24 +140,23 @@ func (s *session) recover() (err error) {
err = nil err = nil
break break
} }
return errors.SetFile(err, m) return errors.SetFd(err, fd)
} }
err = rec.decode(r, numLevel) err = rec.decode(r)
if err == nil { if err == nil {
// save compact pointers // save compact pointers
for _, r := range rec.compPtrs { for _, r := range rec.compPtrs {
s.stCompPtrs[r.level] = iKey(r.ikey) s.setCompPtr(r.level, internalKey(r.ikey))
} }
// commit record to version staging // commit record to version staging
staging.commit(rec) staging.commit(rec)
} else { } else {
err = errors.SetFile(err, m) err = errors.SetFd(err, fd)
if strict || !errors.IsCorrupted(err) { if strict || !errors.IsCorrupted(err) {
return return
} else {
s.logf("manifest error: %v (skipped)", errors.SetFile(err, m))
} }
s.logf("manifest error: %v (skipped)", errors.SetFd(err, fd))
} }
rec.resetCompPtrs() rec.resetCompPtrs()
rec.resetAddedTables() rec.resetAddedTables()
@ -169,18 +165,18 @@ func (s *session) recover() (err error) {
switch { switch {
case !rec.has(recComparer): case !rec.has(recComparer):
return newErrManifestCorrupted(m, "comparer", "missing") return newErrManifestCorrupted(fd, "comparer", "missing")
case rec.comparer != s.icmp.uName(): case rec.comparer != s.icmp.uName():
return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer)) return newErrManifestCorrupted(fd, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
case !rec.has(recNextFileNum): case !rec.has(recNextFileNum):
return newErrManifestCorrupted(m, "next-file-num", "missing") return newErrManifestCorrupted(fd, "next-file-num", "missing")
case !rec.has(recJournalNum): case !rec.has(recJournalNum):
return newErrManifestCorrupted(m, "journal-file-num", "missing") return newErrManifestCorrupted(fd, "journal-file-num", "missing")
case !rec.has(recSeqNum): case !rec.has(recSeqNum):
return newErrManifestCorrupted(m, "seq-num", "missing") return newErrManifestCorrupted(fd, "seq-num", "missing")
} }
s.manifestFile = m s.manifestFd = fd
s.setVersion(staging.finish()) s.setVersion(staging.finish())
s.setNextFileNum(rec.nextFileNum) s.setNextFileNum(rec.nextFileNum)
s.recordCommited(rec) s.recordCommited(rec)

View file

@ -14,41 +14,46 @@ import (
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
) )
func (s *session) pickMemdbLevel(umin, umax []byte) int { func (s *session) pickMemdbLevel(umin, umax []byte, maxLevel int) int {
v := s.version() v := s.version()
defer v.release() defer v.release()
return v.pickMemdbLevel(umin, umax) return v.pickMemdbLevel(umin, umax, maxLevel)
} }
func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, level int) (level_ int, err error) { func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, maxLevel int) (int, error) {
// Create sorted table. // Create sorted table.
iter := mdb.NewIterator(nil) iter := mdb.NewIterator(nil)
defer iter.Release() defer iter.Release()
t, n, err := s.tops.createFrom(iter) t, n, err := s.tops.createFrom(iter)
if err != nil { if err != nil {
return level, err return 0, err
} }
// Pick level and add to record. // Pick level other than zero can cause compaction issue with large
if level < 0 { // bulk insert and delete on strictly incrementing key-space. The
level = s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey()) // problem is that the small deletion markers trapped at lower level,
} // while key/value entries keep growing at higher level. Since the
rec.addTableFile(level, t) // key-space is strictly incrementing it will not overlaps with
// higher level, thus maximum possible level is always picked, while
// overlapping deletion marker pushed into lower level.
// See: https://github.com/syndtr/goleveldb/issues/127.
flushLevel := s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey(), maxLevel)
rec.addTableFile(flushLevel, t)
s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", level, t.file.Num(), n, shortenb(int(t.size)), t.imin, t.imax) s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", flushLevel, t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax)
return level, nil return flushLevel, nil
} }
// Pick a compaction based on current state; need external synchronization. // Pick a compaction based on current state; need external synchronization.
func (s *session) pickCompaction() *compaction { func (s *session) pickCompaction() *compaction {
v := s.version() v := s.version()
var level int var sourceLevel int
var t0 tFiles var t0 tFiles
if v.cScore >= 1 { if v.cScore >= 1 {
level = v.cLevel sourceLevel = v.cLevel
cptr := s.stCompPtrs[level] cptr := s.getCompPtr(sourceLevel)
tables := v.tables[level] tables := v.levels[sourceLevel]
for _, t := range tables { for _, t := range tables {
if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 {
t0 = append(t0, t) t0 = append(t0, t)
@ -61,7 +66,7 @@ func (s *session) pickCompaction() *compaction {
} else { } else {
if p := atomic.LoadPointer(&v.cSeek); p != nil { if p := atomic.LoadPointer(&v.cSeek); p != nil {
ts := (*tSet)(p) ts := (*tSet)(p)
level = ts.level sourceLevel = ts.level
t0 = append(t0, ts.table) t0 = append(t0, ts.table)
} else { } else {
v.release() v.release()
@ -69,14 +74,19 @@ func (s *session) pickCompaction() *compaction {
} }
} }
return newCompaction(s, v, level, t0) return newCompaction(s, v, sourceLevel, t0)
} }
// Create compaction from given level and range; need external synchronization. // Create compaction from given level and range; need external synchronization.
func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { func (s *session) getCompactionRange(sourceLevel int, umin, umax []byte, noLimit bool) *compaction {
v := s.version() v := s.version()
t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0) if sourceLevel >= len(v.levels) {
v.release()
return nil
}
t0 := v.levels[sourceLevel].getOverlaps(nil, s.icmp, umin, umax, sourceLevel == 0)
if len(t0) == 0 { if len(t0) == 0 {
v.release() v.release()
return nil return nil
@ -86,9 +96,9 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
// But we cannot do this for level-0 since level-0 files can overlap // But we cannot do this for level-0 since level-0 files can overlap
// and we must not pick one file and drop another older file if the // and we must not pick one file and drop another older file if the
// two files overlap. // two files overlap.
if level > 0 { if !noLimit && sourceLevel > 0 {
limit := uint64(v.s.o.GetCompactionSourceLimit(level)) limit := int64(v.s.o.GetCompactionSourceLimit(sourceLevel))
total := uint64(0) total := int64(0)
for i, t := range t0 { for i, t := range t0 {
total += t.size total += t.size
if total >= limit { if total >= limit {
@ -99,17 +109,17 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
} }
} }
return newCompaction(s, v, level, t0) return newCompaction(s, v, sourceLevel, t0)
} }
func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction { func newCompaction(s *session, v *version, sourceLevel int, t0 tFiles) *compaction {
c := &compaction{ c := &compaction{
s: s, s: s,
v: v, v: v,
level: level, sourceLevel: sourceLevel,
tables: [2]tFiles{t0, nil}, levels: [2]tFiles{t0, nil},
maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)), maxGPOverlaps: int64(s.o.GetCompactionGPOverlaps(sourceLevel)),
tPtrs: make([]int, s.o.GetNumLevel()), tPtrs: make([]int, len(v.levels)),
} }
c.expand() c.expand()
c.save() c.save()
@ -121,21 +131,21 @@ type compaction struct {
s *session s *session
v *version v *version
level int sourceLevel int
tables [2]tFiles levels [2]tFiles
maxGPOverlaps uint64 maxGPOverlaps int64
gp tFiles gp tFiles
gpi int gpi int
seenKey bool seenKey bool
gpOverlappedBytes uint64 gpOverlappedBytes int64
imin, imax iKey imin, imax internalKey
tPtrs []int tPtrs []int
released bool released bool
snapGPI int snapGPI int
snapSeenKey bool snapSeenKey bool
snapGPOverlappedBytes uint64 snapGPOverlappedBytes int64
snapTPtrs []int snapTPtrs []int
} }
@ -162,30 +172,34 @@ func (c *compaction) release() {
// Expand compacted tables; need external synchronization. // Expand compacted tables; need external synchronization.
func (c *compaction) expand() { func (c *compaction) expand() {
limit := uint64(c.s.o.GetCompactionExpandLimit(c.level)) limit := int64(c.s.o.GetCompactionExpandLimit(c.sourceLevel))
vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1] vt0 := c.v.levels[c.sourceLevel]
vt1 := tFiles{}
if level := c.sourceLevel + 1; level < len(c.v.levels) {
vt1 = c.v.levels[level]
}
t0, t1 := c.tables[0], c.tables[1] t0, t1 := c.levels[0], c.levels[1]
imin, imax := t0.getRange(c.s.icmp) imin, imax := t0.getRange(c.s.icmp)
// We expand t0 here just incase ukey hop across tables. // We expand t0 here just incase ukey hop across tables.
t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0) t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.sourceLevel == 0)
if len(t0) != len(c.tables[0]) { if len(t0) != len(c.levels[0]) {
imin, imax = t0.getRange(c.s.icmp) imin, imax = t0.getRange(c.s.icmp)
} }
t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false)
// Get entire range covered by compaction. // Get entire range covered by compaction.
amin, amax := append(t0, t1...).getRange(c.s.icmp) amin, amax := append(t0, t1...).getRange(c.s.icmp)
// See if we can grow the number of inputs in "level" without // See if we can grow the number of inputs in "sourceLevel" without
// changing the number of "level+1" files we pick up. // changing the number of "sourceLevel+1" files we pick up.
if len(t1) > 0 { if len(t1) > 0 {
exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0) exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.sourceLevel == 0)
if len(exp0) > len(t0) && t1.size()+exp0.size() < limit { if len(exp0) > len(t0) && t1.size()+exp0.size() < limit {
xmin, xmax := exp0.getRange(c.s.icmp) xmin, xmax := exp0.getRange(c.s.icmp)
exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false)
if len(exp1) == len(t1) { if len(exp1) == len(t1) {
c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)",
c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), c.sourceLevel, c.sourceLevel+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size())))
imin, imax = xmin, xmax imin, imax = xmin, xmax
t0, t1 = exp0, exp1 t0, t1 = exp0, exp1
@ -195,22 +209,23 @@ func (c *compaction) expand() {
} }
// Compute the set of grandparent files that overlap this compaction // Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2) // (parent == sourceLevel+1; grandparent == sourceLevel+2)
if c.level+2 < c.s.o.GetNumLevel() { if level := c.sourceLevel + 2; level < len(c.v.levels) {
c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) c.gp = c.v.levels[level].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
} }
c.tables[0], c.tables[1] = t0, t1 c.levels[0], c.levels[1] = t0, t1
c.imin, c.imax = imin, imax c.imin, c.imax = imin, imax
} }
// Check whether compaction is trivial. // Check whether compaction is trivial.
func (c *compaction) trivial() bool { func (c *compaction) trivial() bool {
return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps return len(c.levels[0]) == 1 && len(c.levels[1]) == 0 && c.gp.size() <= c.maxGPOverlaps
} }
func (c *compaction) baseLevelForKey(ukey []byte) bool { func (c *compaction) baseLevelForKey(ukey []byte) bool {
for level, tables := range c.v.tables[c.level+2:] { for level := c.sourceLevel + 2; level < len(c.v.levels); level++ {
tables := c.v.levels[level]
for c.tPtrs[level] < len(tables) { for c.tPtrs[level] < len(tables) {
t := tables[c.tPtrs[level]] t := tables[c.tPtrs[level]]
if c.s.icmp.uCompare(ukey, t.imax.ukey()) <= 0 { if c.s.icmp.uCompare(ukey, t.imax.ukey()) <= 0 {
@ -227,7 +242,7 @@ func (c *compaction) baseLevelForKey(ukey []byte) bool {
return true return true
} }
func (c *compaction) shouldStopBefore(ikey iKey) bool { func (c *compaction) shouldStopBefore(ikey internalKey) bool {
for ; c.gpi < len(c.gp); c.gpi++ { for ; c.gpi < len(c.gp); c.gpi++ {
gp := c.gp[c.gpi] gp := c.gp[c.gpi]
if c.s.icmp.Compare(ikey, gp.imax) <= 0 { if c.s.icmp.Compare(ikey, gp.imax) <= 0 {
@ -250,10 +265,10 @@ func (c *compaction) shouldStopBefore(ikey iKey) bool {
// Creates an iterator. // Creates an iterator.
func (c *compaction) newIterator() iterator.Iterator { func (c *compaction) newIterator() iterator.Iterator {
// Creates iterator slice. // Creates iterator slice.
icap := len(c.tables) icap := len(c.levels)
if c.level == 0 { if c.sourceLevel == 0 {
// Special case for level-0. // Special case for level-0.
icap = len(c.tables[0]) + 1 icap = len(c.levels[0]) + 1
} }
its := make([]iterator.Iterator, 0, icap) its := make([]iterator.Iterator, 0, icap)
@ -267,13 +282,13 @@ func (c *compaction) newIterator() iterator.Iterator {
ro.Strict |= opt.StrictReader ro.Strict |= opt.StrictReader
} }
for i, tables := range c.tables { for i, tables := range c.levels {
if len(tables) == 0 { if len(tables) == 0 {
continue continue
} }
// Level-0 is not sorted and may overlaps each other. // Level-0 is not sorted and may overlaps each other.
if c.level+i == 0 { if c.sourceLevel+i == 0 {
for _, t := range tables { for _, t := range tables {
its = append(its, c.s.tops.newIterator(t, nil, ro)) its = append(its, c.s.tops.newIterator(t, nil, ro))
} }

View file

@ -13,6 +13,7 @@ import (
"strings" "strings"
"github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/storage"
) )
type byteReader interface { type byteReader interface {
@ -35,28 +36,28 @@ const (
type cpRecord struct { type cpRecord struct {
level int level int
ikey iKey ikey internalKey
} }
type atRecord struct { type atRecord struct {
level int level int
num uint64 num int64
size uint64 size int64
imin iKey imin internalKey
imax iKey imax internalKey
} }
type dtRecord struct { type dtRecord struct {
level int level int
num uint64 num int64
} }
type sessionRecord struct { type sessionRecord struct {
hasRec int hasRec int
comparer string comparer string
journalNum uint64 journalNum int64
prevJournalNum uint64 prevJournalNum int64
nextFileNum uint64 nextFileNum int64
seqNum uint64 seqNum uint64
compPtrs []cpRecord compPtrs []cpRecord
addedTables []atRecord addedTables []atRecord
@ -75,17 +76,17 @@ func (p *sessionRecord) setComparer(name string) {
p.comparer = name p.comparer = name
} }
func (p *sessionRecord) setJournalNum(num uint64) { func (p *sessionRecord) setJournalNum(num int64) {
p.hasRec |= 1 << recJournalNum p.hasRec |= 1 << recJournalNum
p.journalNum = num p.journalNum = num
} }
func (p *sessionRecord) setPrevJournalNum(num uint64) { func (p *sessionRecord) setPrevJournalNum(num int64) {
p.hasRec |= 1 << recPrevJournalNum p.hasRec |= 1 << recPrevJournalNum
p.prevJournalNum = num p.prevJournalNum = num
} }
func (p *sessionRecord) setNextFileNum(num uint64) { func (p *sessionRecord) setNextFileNum(num int64) {
p.hasRec |= 1 << recNextFileNum p.hasRec |= 1 << recNextFileNum
p.nextFileNum = num p.nextFileNum = num
} }
@ -95,7 +96,7 @@ func (p *sessionRecord) setSeqNum(num uint64) {
p.seqNum = num p.seqNum = num
} }
func (p *sessionRecord) addCompPtr(level int, ikey iKey) { func (p *sessionRecord) addCompPtr(level int, ikey internalKey) {
p.hasRec |= 1 << recCompPtr p.hasRec |= 1 << recCompPtr
p.compPtrs = append(p.compPtrs, cpRecord{level, ikey}) p.compPtrs = append(p.compPtrs, cpRecord{level, ikey})
} }
@ -105,13 +106,13 @@ func (p *sessionRecord) resetCompPtrs() {
p.compPtrs = p.compPtrs[:0] p.compPtrs = p.compPtrs[:0]
} }
func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) { func (p *sessionRecord) addTable(level int, num, size int64, imin, imax internalKey) {
p.hasRec |= 1 << recAddTable p.hasRec |= 1 << recAddTable
p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax}) p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax})
} }
func (p *sessionRecord) addTableFile(level int, t *tFile) { func (p *sessionRecord) addTableFile(level int, t *tFile) {
p.addTable(level, t.file.Num(), t.size, t.imin, t.imax) p.addTable(level, t.fd.Num, t.size, t.imin, t.imax)
} }
func (p *sessionRecord) resetAddedTables() { func (p *sessionRecord) resetAddedTables() {
@ -119,7 +120,7 @@ func (p *sessionRecord) resetAddedTables() {
p.addedTables = p.addedTables[:0] p.addedTables = p.addedTables[:0]
} }
func (p *sessionRecord) delTable(level int, num uint64) { func (p *sessionRecord) delTable(level int, num int64) {
p.hasRec |= 1 << recDelTable p.hasRec |= 1 << recDelTable
p.deletedTables = append(p.deletedTables, dtRecord{level, num}) p.deletedTables = append(p.deletedTables, dtRecord{level, num})
} }
@ -137,6 +138,13 @@ func (p *sessionRecord) putUvarint(w io.Writer, x uint64) {
_, p.err = w.Write(p.scratch[:n]) _, p.err = w.Write(p.scratch[:n])
} }
func (p *sessionRecord) putVarint(w io.Writer, x int64) {
if x < 0 {
panic("invalid negative value")
}
p.putUvarint(w, uint64(x))
}
func (p *sessionRecord) putBytes(w io.Writer, x []byte) { func (p *sessionRecord) putBytes(w io.Writer, x []byte) {
if p.err != nil { if p.err != nil {
return return
@ -156,11 +164,11 @@ func (p *sessionRecord) encode(w io.Writer) error {
} }
if p.has(recJournalNum) { if p.has(recJournalNum) {
p.putUvarint(w, recJournalNum) p.putUvarint(w, recJournalNum)
p.putUvarint(w, p.journalNum) p.putVarint(w, p.journalNum)
} }
if p.has(recNextFileNum) { if p.has(recNextFileNum) {
p.putUvarint(w, recNextFileNum) p.putUvarint(w, recNextFileNum)
p.putUvarint(w, p.nextFileNum) p.putVarint(w, p.nextFileNum)
} }
if p.has(recSeqNum) { if p.has(recSeqNum) {
p.putUvarint(w, recSeqNum) p.putUvarint(w, recSeqNum)
@ -174,13 +182,13 @@ func (p *sessionRecord) encode(w io.Writer) error {
for _, r := range p.deletedTables { for _, r := range p.deletedTables {
p.putUvarint(w, recDelTable) p.putUvarint(w, recDelTable)
p.putUvarint(w, uint64(r.level)) p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num) p.putVarint(w, r.num)
} }
for _, r := range p.addedTables { for _, r := range p.addedTables {
p.putUvarint(w, recAddTable) p.putUvarint(w, recAddTable)
p.putUvarint(w, uint64(r.level)) p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num) p.putVarint(w, r.num)
p.putUvarint(w, r.size) p.putVarint(w, r.size)
p.putBytes(w, r.imin) p.putBytes(w, r.imin)
p.putBytes(w, r.imax) p.putBytes(w, r.imax)
} }
@ -194,9 +202,9 @@ func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF
x, err := binary.ReadUvarint(r) x, err := binary.ReadUvarint(r)
if err != nil { if err != nil {
if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) { if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) p.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrManifestCorrupted{field, "short read"})
} else if strings.HasPrefix(err.Error(), "binary:") { } else if strings.HasPrefix(err.Error(), "binary:") {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()}) p.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrManifestCorrupted{field, err.Error()})
} else { } else {
p.err = err p.err = err
} }
@ -209,6 +217,14 @@ func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 {
return p.readUvarintMayEOF(field, r, false) return p.readUvarintMayEOF(field, r, false)
} }
func (p *sessionRecord) readVarint(field string, r io.ByteReader) int64 {
x := int64(p.readUvarintMayEOF(field, r, false))
if x < 0 {
p.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrManifestCorrupted{field, "invalid negative value"})
}
return x
}
func (p *sessionRecord) readBytes(field string, r byteReader) []byte { func (p *sessionRecord) readBytes(field string, r byteReader) []byte {
if p.err != nil { if p.err != nil {
return nil return nil
@ -221,14 +237,14 @@ func (p *sessionRecord) readBytes(field string, r byteReader) []byte {
_, p.err = io.ReadFull(r, x) _, p.err = io.ReadFull(r, x)
if p.err != nil { if p.err != nil {
if p.err == io.ErrUnexpectedEOF { if p.err == io.ErrUnexpectedEOF {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) p.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrManifestCorrupted{field, "short read"})
} }
return nil return nil
} }
return x return x
} }
func (p *sessionRecord) readLevel(field string, r io.ByteReader, numLevel int) int { func (p *sessionRecord) readLevel(field string, r io.ByteReader) int {
if p.err != nil { if p.err != nil {
return 0 return 0
} }
@ -236,14 +252,10 @@ func (p *sessionRecord) readLevel(field string, r io.ByteReader, numLevel int) i
if p.err != nil { if p.err != nil {
return 0 return 0
} }
if x >= uint64(numLevel) {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"})
return 0
}
return int(x) return int(x)
} }
func (p *sessionRecord) decode(r io.Reader, numLevel int) error { func (p *sessionRecord) decode(r io.Reader) error {
br, ok := r.(byteReader) br, ok := r.(byteReader)
if !ok { if !ok {
br = bufio.NewReader(r) br = bufio.NewReader(r)
@ -264,17 +276,17 @@ func (p *sessionRecord) decode(r io.Reader, numLevel int) error {
p.setComparer(string(x)) p.setComparer(string(x))
} }
case recJournalNum: case recJournalNum:
x := p.readUvarint("journal-num", br) x := p.readVarint("journal-num", br)
if p.err == nil { if p.err == nil {
p.setJournalNum(x) p.setJournalNum(x)
} }
case recPrevJournalNum: case recPrevJournalNum:
x := p.readUvarint("prev-journal-num", br) x := p.readVarint("prev-journal-num", br)
if p.err == nil { if p.err == nil {
p.setPrevJournalNum(x) p.setPrevJournalNum(x)
} }
case recNextFileNum: case recNextFileNum:
x := p.readUvarint("next-file-num", br) x := p.readVarint("next-file-num", br)
if p.err == nil { if p.err == nil {
p.setNextFileNum(x) p.setNextFileNum(x)
} }
@ -284,23 +296,23 @@ func (p *sessionRecord) decode(r io.Reader, numLevel int) error {
p.setSeqNum(x) p.setSeqNum(x)
} }
case recCompPtr: case recCompPtr:
level := p.readLevel("comp-ptr.level", br, numLevel) level := p.readLevel("comp-ptr.level", br)
ikey := p.readBytes("comp-ptr.ikey", br) ikey := p.readBytes("comp-ptr.ikey", br)
if p.err == nil { if p.err == nil {
p.addCompPtr(level, iKey(ikey)) p.addCompPtr(level, internalKey(ikey))
} }
case recAddTable: case recAddTable:
level := p.readLevel("add-table.level", br, numLevel) level := p.readLevel("add-table.level", br)
num := p.readUvarint("add-table.num", br) num := p.readVarint("add-table.num", br)
size := p.readUvarint("add-table.size", br) size := p.readVarint("add-table.size", br)
imin := p.readBytes("add-table.imin", br) imin := p.readBytes("add-table.imin", br)
imax := p.readBytes("add-table.imax", br) imax := p.readBytes("add-table.imax", br)
if p.err == nil { if p.err == nil {
p.addTable(level, num, size, imin, imax) p.addTable(level, num, size, imin, imax)
} }
case recDelTable: case recDelTable:
level := p.readLevel("del-table.level", br, numLevel) level := p.readLevel("del-table.level", br)
num := p.readUvarint("del-table.num", br) num := p.readVarint("del-table.num", br)
if p.err == nil { if p.err == nil {
p.delTable(level, num) p.delTable(level, num)
} }

View file

@ -18,14 +18,14 @@ import (
type dropper struct { type dropper struct {
s *session s *session
file storage.File fd storage.FileDesc
} }
func (d dropper) Drop(err error) { func (d dropper) Drop(err error) {
if e, ok := err.(*journal.ErrCorrupted); ok { if e, ok := err.(*journal.ErrCorrupted); ok {
d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason) d.s.logf("journal@drop %s-%d S·%s %q", d.fd.Type, d.fd.Num, shortenb(e.Size), e.Reason)
} else { } else {
d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err) d.s.logf("journal@drop %s-%d %q", d.fd.Type, d.fd.Num, err)
} }
} }
@ -34,25 +34,9 @@ func (s *session) logf(format string, v ...interface{}) { s.stor.Log(fmt.Sprintf
// File utils. // File utils.
func (s *session) getJournalFile(num uint64) storage.File { func (s *session) newTemp() storage.FileDesc {
return s.stor.GetFile(num, storage.TypeJournal) num := atomic.AddInt64(&s.stTempFileNum, 1) - 1
} return storage.FileDesc{storage.TypeTemp, num}
func (s *session) getTableFile(num uint64) storage.File {
return s.stor.GetFile(num, storage.TypeTable)
}
func (s *session) getFiles(t storage.FileType) ([]storage.File, error) {
return s.stor.GetFiles(t)
}
func (s *session) newTemp() storage.File {
num := atomic.AddUint64(&s.stTempFileNum, 1) - 1
return s.stor.GetFile(num, storage.TypeTemp)
}
func (s *session) tableFileFromRecord(r atRecord) *tFile {
return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
} }
// Session state. // Session state.
@ -80,47 +64,65 @@ func (s *session) setVersion(v *version) {
} }
// Get current unused file number. // Get current unused file number.
func (s *session) nextFileNum() uint64 { func (s *session) nextFileNum() int64 {
return atomic.LoadUint64(&s.stNextFileNum) return atomic.LoadInt64(&s.stNextFileNum)
} }
// Set current unused file number to num. // Set current unused file number to num.
func (s *session) setNextFileNum(num uint64) { func (s *session) setNextFileNum(num int64) {
atomic.StoreUint64(&s.stNextFileNum, num) atomic.StoreInt64(&s.stNextFileNum, num)
} }
// Mark file number as used. // Mark file number as used.
func (s *session) markFileNum(num uint64) { func (s *session) markFileNum(num int64) {
nextFileNum := num + 1 nextFileNum := num + 1
for { for {
old, x := s.stNextFileNum, nextFileNum old, x := s.stNextFileNum, nextFileNum
if old > x { if old > x {
x = old x = old
} }
if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) {
break break
} }
} }
} }
// Allocate a file number. // Allocate a file number.
func (s *session) allocFileNum() uint64 { func (s *session) allocFileNum() int64 {
return atomic.AddUint64(&s.stNextFileNum, 1) - 1 return atomic.AddInt64(&s.stNextFileNum, 1) - 1
} }
// Reuse given file number. // Reuse given file number.
func (s *session) reuseFileNum(num uint64) { func (s *session) reuseFileNum(num int64) {
for { for {
old, x := s.stNextFileNum, num old, x := s.stNextFileNum, num
if old != x+1 { if old != x+1 {
x = old x = old
} }
if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) {
break break
} }
} }
} }
// Set compaction ptr at given level; need external synchronization.
func (s *session) setCompPtr(level int, ik internalKey) {
if level >= len(s.stCompPtrs) {
newCompPtrs := make([]internalKey, level+1)
copy(newCompPtrs, s.stCompPtrs)
s.stCompPtrs = newCompPtrs
}
s.stCompPtrs[level] = append(internalKey{}, ik...)
}
// Get compaction ptr at given level; need external synchronization.
func (s *session) getCompPtr(level int) internalKey {
if level >= len(s.stCompPtrs) {
return nil
}
return s.stCompPtrs[level]
}
// Manifest related utils. // Manifest related utils.
// Fill given session record obj with current states; need external // Fill given session record obj with current states; need external
@ -149,29 +151,28 @@ func (s *session) fillRecord(r *sessionRecord, snapshot bool) {
// Mark if record has been committed, this will update session state; // Mark if record has been committed, this will update session state;
// need external synchronization. // need external synchronization.
func (s *session) recordCommited(r *sessionRecord) { func (s *session) recordCommited(rec *sessionRecord) {
if r.has(recJournalNum) { if rec.has(recJournalNum) {
s.stJournalNum = r.journalNum s.stJournalNum = rec.journalNum
} }
if r.has(recPrevJournalNum) { if rec.has(recPrevJournalNum) {
s.stPrevJournalNum = r.prevJournalNum s.stPrevJournalNum = rec.prevJournalNum
} }
if r.has(recSeqNum) { if rec.has(recSeqNum) {
s.stSeqNum = r.seqNum s.stSeqNum = rec.seqNum
} }
for _, p := range r.compPtrs { for _, r := range rec.compPtrs {
s.stCompPtrs[p.level] = iKey(p.ikey) s.setCompPtr(r.level, internalKey(r.ikey))
} }
} }
// Create a new manifest file; need external synchronization. // Create a new manifest file; need external synchronization.
func (s *session) newManifest(rec *sessionRecord, v *version) (err error) { func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
num := s.allocFileNum() fd := storage.FileDesc{storage.TypeManifest, s.allocFileNum()}
file := s.stor.GetFile(num, storage.TypeManifest) writer, err := s.stor.Create(fd)
writer, err := file.Create()
if err != nil { if err != nil {
return return
} }
@ -196,16 +197,16 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
if s.manifestWriter != nil { if s.manifestWriter != nil {
s.manifestWriter.Close() s.manifestWriter.Close()
} }
if s.manifestFile != nil { if !s.manifestFd.Nil() {
s.manifestFile.Remove() s.stor.Remove(s.manifestFd)
} }
s.manifestFile = file s.manifestFd = fd
s.manifestWriter = writer s.manifestWriter = writer
s.manifest = jw s.manifest = jw
} else { } else {
writer.Close() writer.Close()
file.Remove() s.stor.Remove(fd)
s.reuseFileNum(num) s.reuseFileNum(fd.Num)
} }
}() }()
@ -221,7 +222,7 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
if err != nil { if err != nil {
return return
} }
err = s.stor.SetManifest(file) err = s.stor.SetMeta(fd)
return return
} }

View file

@ -17,11 +17,12 @@ import (
"strings" "strings"
"sync" "sync"
"time" "time"
"github.com/syndtr/goleveldb/leveldb/util"
) )
var errFileOpen = errors.New("leveldb/storage: file still open") var (
errFileOpen = errors.New("leveldb/storage: file still open")
errReadOnly = errors.New("leveldb/storage: storage is read-only")
)
type fileLock interface { type fileLock interface {
release() error release() error
@ -32,23 +33,27 @@ type fileStorageLock struct {
} }
func (lock *fileStorageLock) Release() { func (lock *fileStorageLock) Release() {
fs := lock.fs if lock.fs != nil {
fs.mu.Lock() lock.fs.mu.Lock()
defer fs.mu.Unlock() defer lock.fs.mu.Unlock()
if fs.slock == lock { if lock.fs.slock == lock {
fs.slock = nil lock.fs.slock = nil
} }
return
} }
}
const logSizeThreshold = 1024 * 1024 // 1 MiB
// fileStorage is a file-system backed storage. // fileStorage is a file-system backed storage.
type fileStorage struct { type fileStorage struct {
path string path string
readOnly bool
mu sync.Mutex mu sync.Mutex
flock fileLock flock fileLock
slock *fileStorageLock slock *fileStorageLock
logw *os.File logw *os.File
logSize int64
buf []byte buf []byte
// Opened file counter; if open < 0 means closed. // Opened file counter; if open < 0 means closed.
open int open int
@ -56,16 +61,24 @@ type fileStorage struct {
} }
// OpenFile returns a new filesytem-backed storage implementation with the given // OpenFile returns a new filesytem-backed storage implementation with the given
// path. This also hold a file lock, so any subsequent attempt to open the same // path. This also acquire a file lock, so any subsequent attempt to open the
// path will fail. // same path will fail.
// //
// The storage must be closed after use, by calling Close method. // The storage must be closed after use, by calling Close method.
func OpenFile(path string) (Storage, error) { func OpenFile(path string, readOnly bool) (Storage, error) {
if fi, err := os.Stat(path); err == nil {
if !fi.IsDir() {
return nil, fmt.Errorf("leveldb/storage: open %s: not a directory", path)
}
} else if os.IsNotExist(err) && !readOnly {
if err := os.MkdirAll(path, 0755); err != nil { if err := os.MkdirAll(path, 0755); err != nil {
return nil, err return nil, err
} }
} else {
return nil, err
}
flock, err := newFileLock(filepath.Join(path, "LOCK")) flock, err := newFileLock(filepath.Join(path, "LOCK"), readOnly)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -76,23 +89,42 @@ func OpenFile(path string) (Storage, error) {
} }
}() }()
rename(filepath.Join(path, "LOG"), filepath.Join(path, "LOG.old")) var (
logw, err := os.OpenFile(filepath.Join(path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644) logw *os.File
logSize int64
)
if !readOnly {
logw, err = os.OpenFile(filepath.Join(path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644)
if err != nil { if err != nil {
return nil, err return nil, err
} }
logSize, err = logw.Seek(0, os.SEEK_END)
if err != nil {
logw.Close()
return nil, err
}
}
fs := &fileStorage{path: path, flock: flock, logw: logw} fs := &fileStorage{
path: path,
readOnly: readOnly,
flock: flock,
logw: logw,
logSize: logSize,
}
runtime.SetFinalizer(fs, (*fileStorage).Close) runtime.SetFinalizer(fs, (*fileStorage).Close)
return fs, nil return fs, nil
} }
func (fs *fileStorage) Lock() (util.Releaser, error) { func (fs *fileStorage) Lock() (Lock, error) {
fs.mu.Lock() fs.mu.Lock()
defer fs.mu.Unlock() defer fs.mu.Unlock()
if fs.open < 0 { if fs.open < 0 {
return nil, ErrClosed return nil, ErrClosed
} }
if fs.readOnly {
return &fileStorageLock{}, nil
}
if fs.slock != nil { if fs.slock != nil {
return nil, ErrLocked return nil, ErrLocked
} }
@ -101,7 +133,7 @@ func (fs *fileStorage) Lock() (util.Releaser, error) {
} }
func itoa(buf []byte, i int, wid int) []byte { func itoa(buf []byte, i int, wid int) []byte {
var u uint = uint(i) u := uint(i)
if u == 0 && wid <= 1 { if u == 0 && wid <= 1 {
return append(buf, '0') return append(buf, '0')
} }
@ -126,6 +158,22 @@ func (fs *fileStorage) printDay(t time.Time) {
} }
func (fs *fileStorage) doLog(t time.Time, str string) { func (fs *fileStorage) doLog(t time.Time, str string) {
if fs.logSize > logSizeThreshold {
// Rotate log file.
fs.logw.Close()
fs.logw = nil
fs.logSize = 0
rename(filepath.Join(fs.path, "LOG"), filepath.Join(fs.path, "LOG.old"))
}
if fs.logw == nil {
var err error
fs.logw, err = os.OpenFile(filepath.Join(fs.path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644)
if err != nil {
return
}
// Force printDay on new log file.
fs.day = 0
}
fs.printDay(t) fs.printDay(t)
hour, min, sec := t.Clock() hour, min, sec := t.Clock()
msec := t.Nanosecond() / 1e3 msec := t.Nanosecond() / 1e3
@ -145,6 +193,7 @@ func (fs *fileStorage) doLog(t time.Time, str string) {
} }
func (fs *fileStorage) Log(str string) { func (fs *fileStorage) Log(str string) {
if !fs.readOnly {
t := time.Now() t := time.Now()
fs.mu.Lock() fs.mu.Lock()
defer fs.mu.Unlock() defer fs.mu.Unlock()
@ -153,57 +202,62 @@ func (fs *fileStorage) Log(str string) {
} }
fs.doLog(t, str) fs.doLog(t, str)
} }
}
func (fs *fileStorage) log(str string) { func (fs *fileStorage) log(str string) {
if !fs.readOnly {
fs.doLog(time.Now(), str) fs.doLog(time.Now(), str)
} }
func (fs *fileStorage) GetFile(num uint64, t FileType) File {
return &file{fs: fs, num: num, t: t}
} }
func (fs *fileStorage) GetFiles(t FileType) (ff []File, err error) { func (fs *fileStorage) SetMeta(fd FileDesc) (err error) {
if !FileDescOk(fd) {
return ErrInvalidFile
}
if fs.readOnly {
return errReadOnly
}
fs.mu.Lock() fs.mu.Lock()
defer fs.mu.Unlock() defer fs.mu.Unlock()
if fs.open < 0 { if fs.open < 0 {
return nil, ErrClosed return ErrClosed
}
defer func() {
if err != nil {
fs.log(fmt.Sprintf("CURRENT: %v", err))
}
}()
path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), fd.Num)
w, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return
}
_, err = fmt.Fprintln(w, fsGenName(fd))
// Close the file first.
if cerr := w.Close(); cerr != nil {
fs.log(fmt.Sprintf("close CURRENT.%d: %v", fd.Num, cerr))
}
if err != nil {
return
}
return rename(path, filepath.Join(fs.path, "CURRENT"))
}
func (fs *fileStorage) GetMeta() (fd FileDesc, err error) {
fs.mu.Lock()
defer fs.mu.Unlock()
if fs.open < 0 {
return FileDesc{}, ErrClosed
} }
dir, err := os.Open(fs.path) dir, err := os.Open(fs.path)
if err != nil { if err != nil {
return return
} }
fnn, err := dir.Readdirnames(0) names, err := dir.Readdirnames(0)
// Close the dir first before checking for Readdirnames error. // Close the dir first before checking for Readdirnames error.
if err := dir.Close(); err != nil { if ce := dir.Close(); ce != nil {
fs.log(fmt.Sprintf("close dir: %v", err)) fs.log(fmt.Sprintf("close dir: %v", ce))
}
if err != nil {
return
}
f := &file{fs: fs}
for _, fn := range fnn {
if f.parse(fn) && (f.t&t) != 0 {
ff = append(ff, f)
f = &file{fs: fs}
}
}
return
}
func (fs *fileStorage) GetManifest() (f File, err error) {
fs.mu.Lock()
defer fs.mu.Unlock()
if fs.open < 0 {
return nil, ErrClosed
}
dir, err := os.Open(fs.path)
if err != nil {
return
}
fnn, err := dir.Readdirnames(0)
// Close the dir first before checking for Readdirnames error.
if err := dir.Close(); err != nil {
fs.log(fmt.Sprintf("close dir: %v", err))
} }
if err != nil { if err != nil {
return return
@ -212,58 +266,64 @@ func (fs *fileStorage) GetManifest() (f File, err error) {
var rem []string var rem []string
var pend bool var pend bool
var cerr error var cerr error
for _, fn := range fnn { for _, name := range names {
if strings.HasPrefix(fn, "CURRENT") { if strings.HasPrefix(name, "CURRENT") {
pend1 := len(fn) > 7 pend1 := len(name) > 7
var pendNum int64
// Make sure it is valid name for a CURRENT file, otherwise skip it. // Make sure it is valid name for a CURRENT file, otherwise skip it.
if pend1 { if pend1 {
if fn[7] != '.' || len(fn) < 9 { if name[7] != '.' || len(name) < 9 {
fs.log(fmt.Sprintf("skipping %s: invalid file name", fn)) fs.log(fmt.Sprintf("skipping %s: invalid file name", name))
continue continue
} }
if _, e1 := strconv.ParseUint(fn[8:], 10, 0); e1 != nil { var e1 error
fs.log(fmt.Sprintf("skipping %s: invalid file num: %v", fn, e1)) if pendNum, e1 = strconv.ParseInt(name[8:], 10, 0); e1 != nil {
fs.log(fmt.Sprintf("skipping %s: invalid file num: %v", name, e1))
continue continue
} }
} }
path := filepath.Join(fs.path, fn) path := filepath.Join(fs.path, name)
r, e1 := os.OpenFile(path, os.O_RDONLY, 0) r, e1 := os.OpenFile(path, os.O_RDONLY, 0)
if e1 != nil { if e1 != nil {
return nil, e1 return FileDesc{}, e1
} }
b, e1 := ioutil.ReadAll(r) b, e1 := ioutil.ReadAll(r)
if e1 != nil { if e1 != nil {
r.Close() r.Close()
return nil, e1 return FileDesc{}, e1
} }
f1 := &file{fs: fs} var fd1 FileDesc
if len(b) < 1 || b[len(b)-1] != '\n' || !f1.parse(string(b[:len(b)-1])) { if len(b) < 1 || b[len(b)-1] != '\n' || !fsParseNamePtr(string(b[:len(b)-1]), &fd1) {
fs.log(fmt.Sprintf("skipping %s: corrupted or incomplete", fn)) fs.log(fmt.Sprintf("skipping %s: corrupted or incomplete", name))
if pend1 { if pend1 {
rem = append(rem, fn) rem = append(rem, name)
} }
if !pend1 || cerr == nil { if !pend1 || cerr == nil {
metaFd, _ := fsParseName(name)
cerr = &ErrCorrupted{ cerr = &ErrCorrupted{
File: fsParseName(filepath.Base(fn)), Fd: metaFd,
Err: errors.New("leveldb/storage: corrupted or incomplete manifest file"), Err: errors.New("leveldb/storage: corrupted or incomplete meta file"),
} }
} }
} else if f != nil && f1.Num() < f.Num() { } else if pend1 && pendNum != fd1.Num {
fs.log(fmt.Sprintf("skipping %s: obsolete", fn)) fs.log(fmt.Sprintf("skipping %s: inconsistent pending-file num: %d vs %d", name, pendNum, fd1.Num))
rem = append(rem, name)
} else if fd1.Num < fd.Num {
fs.log(fmt.Sprintf("skipping %s: obsolete", name))
if pend1 { if pend1 {
rem = append(rem, fn) rem = append(rem, name)
} }
} else { } else {
f = f1 fd = fd1
pend = pend1 pend = pend1
} }
if err := r.Close(); err != nil { if err := r.Close(); err != nil {
fs.log(fmt.Sprintf("close %s: %v", fn, err)) fs.log(fmt.Sprintf("close %s: %v", name, err))
} }
} }
} }
// Don't remove any files if there is no valid CURRENT file. // Don't remove any files if there is no valid CURRENT file.
if f == nil { if fd.Nil() {
if cerr != nil { if cerr != nil {
err = cerr err = cerr
} else { } else {
@ -271,52 +331,140 @@ func (fs *fileStorage) GetManifest() (f File, err error) {
} }
return return
} }
if !fs.readOnly {
// Rename pending CURRENT file to an effective CURRENT. // Rename pending CURRENT file to an effective CURRENT.
if pend { if pend {
path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), f.Num()) path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), fd.Num)
if err := rename(path, filepath.Join(fs.path, "CURRENT")); err != nil { if err := rename(path, filepath.Join(fs.path, "CURRENT")); err != nil {
fs.log(fmt.Sprintf("CURRENT.%d -> CURRENT: %v", f.Num(), err)) fs.log(fmt.Sprintf("CURRENT.%d -> CURRENT: %v", fd.Num, err))
} }
} }
// Remove obsolete or incomplete pending CURRENT files. // Remove obsolete or incomplete pending CURRENT files.
for _, fn := range rem { for _, name := range rem {
path := filepath.Join(fs.path, fn) path := filepath.Join(fs.path, name)
if err := os.Remove(path); err != nil { if err := os.Remove(path); err != nil {
fs.log(fmt.Sprintf("remove %s: %v", fn, err)) fs.log(fmt.Sprintf("remove %s: %v", name, err))
}
} }
} }
return return
} }
func (fs *fileStorage) SetManifest(f File) (err error) { func (fs *fileStorage) List(ft FileType) (fds []FileDesc, err error) {
fs.mu.Lock()
defer fs.mu.Unlock()
if fs.open < 0 {
return nil, ErrClosed
}
dir, err := os.Open(fs.path)
if err != nil {
return
}
names, err := dir.Readdirnames(0)
// Close the dir first before checking for Readdirnames error.
if cerr := dir.Close(); cerr != nil {
fs.log(fmt.Sprintf("close dir: %v", cerr))
}
if err == nil {
for _, name := range names {
if fd, ok := fsParseName(name); ok && fd.Type&ft != 0 {
fds = append(fds, fd)
}
}
}
return
}
func (fs *fileStorage) Open(fd FileDesc) (Reader, error) {
if !FileDescOk(fd) {
return nil, ErrInvalidFile
}
fs.mu.Lock()
defer fs.mu.Unlock()
if fs.open < 0 {
return nil, ErrClosed
}
of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_RDONLY, 0)
if err != nil {
if fsHasOldName(fd) && os.IsNotExist(err) {
of, err = os.OpenFile(filepath.Join(fs.path, fsGenOldName(fd)), os.O_RDONLY, 0)
if err == nil {
goto ok
}
}
return nil, err
}
ok:
fs.open++
return &fileWrap{File: of, fs: fs, fd: fd}, nil
}
func (fs *fileStorage) Create(fd FileDesc) (Writer, error) {
if !FileDescOk(fd) {
return nil, ErrInvalidFile
}
if fs.readOnly {
return nil, errReadOnly
}
fs.mu.Lock()
defer fs.mu.Unlock()
if fs.open < 0 {
return nil, ErrClosed
}
of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return nil, err
}
fs.open++
return &fileWrap{File: of, fs: fs, fd: fd}, nil
}
func (fs *fileStorage) Remove(fd FileDesc) error {
if !FileDescOk(fd) {
return ErrInvalidFile
}
if fs.readOnly {
return errReadOnly
}
fs.mu.Lock() fs.mu.Lock()
defer fs.mu.Unlock() defer fs.mu.Unlock()
if fs.open < 0 { if fs.open < 0 {
return ErrClosed return ErrClosed
} }
f2, ok := f.(*file) err := os.Remove(filepath.Join(fs.path, fsGenName(fd)))
if !ok || f2.t != TypeManifest { if err != nil {
if fsHasOldName(fd) && os.IsNotExist(err) {
if e1 := os.Remove(filepath.Join(fs.path, fsGenOldName(fd))); !os.IsNotExist(e1) {
fs.log(fmt.Sprintf("remove %s: %v (old name)", fd, err))
err = e1
}
} else {
fs.log(fmt.Sprintf("remove %s: %v", fd, err))
}
}
return err
}
func (fs *fileStorage) Rename(oldfd, newfd FileDesc) error {
if !FileDescOk(oldfd) || !FileDescOk(newfd) {
return ErrInvalidFile return ErrInvalidFile
} }
defer func() { if oldfd == newfd {
if err != nil { return nil
fs.log(fmt.Sprintf("CURRENT: %v", err))
} }
}() if fs.readOnly {
path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), f2.Num()) return errReadOnly
w, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return err
} }
_, err = fmt.Fprintln(w, f2.name())
// Close the file first. fs.mu.Lock()
if err := w.Close(); err != nil { defer fs.mu.Unlock()
fs.log(fmt.Sprintf("close CURRENT.%d: %v", f2.num, err)) if fs.open < 0 {
return ErrClosed
} }
if err != nil { return rename(filepath.Join(fs.path, fsGenName(oldfd)), filepath.Join(fs.path, fsGenName(newfd)))
return err
}
return rename(path, filepath.Join(fs.path, "CURRENT"))
} }
func (fs *fileStorage) Close() error { func (fs *fileStorage) Close() error {
@ -332,212 +480,104 @@ func (fs *fileStorage) Close() error {
fs.log(fmt.Sprintf("close: warning, %d files still open", fs.open)) fs.log(fmt.Sprintf("close: warning, %d files still open", fs.open))
} }
fs.open = -1 fs.open = -1
e1 := fs.logw.Close() if fs.logw != nil {
err := fs.flock.release() fs.logw.Close()
if err == nil {
err = e1
} }
return err return fs.flock.release()
} }
type fileWrap struct { type fileWrap struct {
*os.File *os.File
f *file fs *fileStorage
fd FileDesc
closed bool
} }
func (fw fileWrap) Sync() error { func (fw *fileWrap) Sync() error {
if err := fw.File.Sync(); err != nil { if err := fw.File.Sync(); err != nil {
return err return err
} }
if fw.f.Type() == TypeManifest { if fw.fd.Type == TypeManifest {
// Also sync parent directory if file type is manifest. // Also sync parent directory if file type is manifest.
// See: https://code.google.com/p/leveldb/issues/detail?id=190. // See: https://code.google.com/p/leveldb/issues/detail?id=190.
if err := syncDir(fw.f.fs.path); err != nil { if err := syncDir(fw.fs.path); err != nil {
fw.fs.log(fmt.Sprintf("syncDir: %v", err))
return err return err
} }
} }
return nil return nil
} }
func (fw fileWrap) Close() error { func (fw *fileWrap) Close() error {
f := fw.f fw.fs.mu.Lock()
f.fs.mu.Lock() defer fw.fs.mu.Unlock()
defer f.fs.mu.Unlock() if fw.closed {
if !f.open {
return ErrClosed return ErrClosed
} }
f.open = false fw.closed = true
f.fs.open-- fw.fs.open--
err := fw.File.Close() err := fw.File.Close()
if err != nil { if err != nil {
f.fs.log(fmt.Sprintf("close %s.%d: %v", f.Type(), f.Num(), err)) fw.fs.log(fmt.Sprintf("close %s: %v", fw.fd, err))
} }
return err return err
} }
type file struct { func fsGenName(fd FileDesc) string {
fs *fileStorage switch fd.Type {
num uint64
t FileType
open bool
}
func (f *file) Open() (Reader, error) {
f.fs.mu.Lock()
defer f.fs.mu.Unlock()
if f.fs.open < 0 {
return nil, ErrClosed
}
if f.open {
return nil, errFileOpen
}
of, err := os.OpenFile(f.path(), os.O_RDONLY, 0)
if err != nil {
if f.hasOldName() && os.IsNotExist(err) {
of, err = os.OpenFile(f.oldPath(), os.O_RDONLY, 0)
if err == nil {
goto ok
}
}
return nil, err
}
ok:
f.open = true
f.fs.open++
return fileWrap{of, f}, nil
}
func (f *file) Create() (Writer, error) {
f.fs.mu.Lock()
defer f.fs.mu.Unlock()
if f.fs.open < 0 {
return nil, ErrClosed
}
if f.open {
return nil, errFileOpen
}
of, err := os.OpenFile(f.path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
if err != nil {
return nil, err
}
f.open = true
f.fs.open++
return fileWrap{of, f}, nil
}
func (f *file) Replace(newfile File) error {
f.fs.mu.Lock()
defer f.fs.mu.Unlock()
if f.fs.open < 0 {
return ErrClosed
}
newfile2, ok := newfile.(*file)
if !ok {
return ErrInvalidFile
}
if f.open || newfile2.open {
return errFileOpen
}
return rename(newfile2.path(), f.path())
}
func (f *file) Type() FileType {
return f.t
}
func (f *file) Num() uint64 {
return f.num
}
func (f *file) Remove() error {
f.fs.mu.Lock()
defer f.fs.mu.Unlock()
if f.fs.open < 0 {
return ErrClosed
}
if f.open {
return errFileOpen
}
err := os.Remove(f.path())
if err != nil {
f.fs.log(fmt.Sprintf("remove %s.%d: %v", f.Type(), f.Num(), err))
}
// Also try remove file with old name, just in case.
if f.hasOldName() {
if e1 := os.Remove(f.oldPath()); !os.IsNotExist(e1) {
f.fs.log(fmt.Sprintf("remove %s.%d: %v (old name)", f.Type(), f.Num(), err))
err = e1
}
}
return err
}
func (f *file) hasOldName() bool {
return f.t == TypeTable
}
func (f *file) oldName() string {
switch f.t {
case TypeTable:
return fmt.Sprintf("%06d.sst", f.num)
}
return f.name()
}
func (f *file) oldPath() string {
return filepath.Join(f.fs.path, f.oldName())
}
func (f *file) name() string {
switch f.t {
case TypeManifest: case TypeManifest:
return fmt.Sprintf("MANIFEST-%06d", f.num) return fmt.Sprintf("MANIFEST-%06d", fd.Num)
case TypeJournal: case TypeJournal:
return fmt.Sprintf("%06d.log", f.num) return fmt.Sprintf("%06d.log", fd.Num)
case TypeTable: case TypeTable:
return fmt.Sprintf("%06d.ldb", f.num) return fmt.Sprintf("%06d.ldb", fd.Num)
case TypeTemp: case TypeTemp:
return fmt.Sprintf("%06d.tmp", f.num) return fmt.Sprintf("%06d.tmp", fd.Num)
default: default:
panic("invalid file type") panic("invalid file type")
} }
} }
func (f *file) path() string { func fsHasOldName(fd FileDesc) bool {
return filepath.Join(f.fs.path, f.name()) return fd.Type == TypeTable
} }
func fsParseName(name string) *FileInfo { func fsGenOldName(fd FileDesc) string {
fi := &FileInfo{} switch fd.Type {
case TypeTable:
return fmt.Sprintf("%06d.sst", fd.Num)
}
return fsGenName(fd)
}
func fsParseName(name string) (fd FileDesc, ok bool) {
var tail string var tail string
_, err := fmt.Sscanf(name, "%d.%s", &fi.Num, &tail) _, err := fmt.Sscanf(name, "%d.%s", &fd.Num, &tail)
if err == nil { if err == nil {
switch tail { switch tail {
case "log": case "log":
fi.Type = TypeJournal fd.Type = TypeJournal
case "ldb", "sst": case "ldb", "sst":
fi.Type = TypeTable fd.Type = TypeTable
case "tmp": case "tmp":
fi.Type = TypeTemp fd.Type = TypeTemp
default: default:
return nil return
} }
return fi return fd, true
} }
n, _ := fmt.Sscanf(name, "MANIFEST-%d%s", &fi.Num, &tail) n, _ := fmt.Sscanf(name, "MANIFEST-%d%s", &fd.Num, &tail)
if n == 1 { if n == 1 {
fi.Type = TypeManifest fd.Type = TypeManifest
return fi return fd, true
} }
return nil return
} }
func (f *file) parse(name string) bool { func fsParseNamePtr(name string, fd *FileDesc) bool {
fi := fsParseName(name) _fd, ok := fsParseName(name)
if fi == nil { if fd != nil {
return false *fd = _fd
} }
f.t = fi.Type return ok
f.num = fi.Num
return true
} }

View file

@ -0,0 +1,34 @@
// Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// +build nacl
package storage
import (
"os"
"syscall"
)
func newFileLock(path string, readOnly bool) (fl fileLock, err error) {
return nil, syscall.ENOTSUP
}
func setFileLock(f *os.File, readOnly, lock bool) error {
return syscall.ENOTSUP
}
func rename(oldpath, newpath string) error {
return syscall.ENOTSUP
}
func isErrInvalid(err error) bool {
return false
}
func syncDir(name string) error {
return syscall.ENOTSUP
}

View file

@ -19,8 +19,21 @@ func (fl *plan9FileLock) release() error {
return fl.f.Close() return fl.f.Close()
} }
func newFileLock(path string) (fl fileLock, err error) { func newFileLock(path string, readOnly bool) (fl fileLock, err error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644) var (
flag int
perm os.FileMode
)
if readOnly {
flag = os.O_RDONLY
} else {
flag = os.O_RDWR
perm = os.ModeExclusive
}
f, err := os.OpenFile(path, flag, perm)
if os.IsNotExist(err) {
f, err = os.OpenFile(path, flag|os.O_CREATE, perm|0644)
}
if err != nil { if err != nil {
return return
} }

View file

@ -18,18 +18,27 @@ type unixFileLock struct {
} }
func (fl *unixFileLock) release() error { func (fl *unixFileLock) release() error {
if err := setFileLock(fl.f, false); err != nil { if err := setFileLock(fl.f, false, false); err != nil {
return err return err
} }
return fl.f.Close() return fl.f.Close()
} }
func newFileLock(path string) (fl fileLock, err error) { func newFileLock(path string, readOnly bool) (fl fileLock, err error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644) var flag int
if readOnly {
flag = os.O_RDONLY
} else {
flag = os.O_RDWR
}
f, err := os.OpenFile(path, flag, 0)
if os.IsNotExist(err) {
f, err = os.OpenFile(path, flag|os.O_CREATE, 0644)
}
if err != nil { if err != nil {
return return
} }
err = setFileLock(f, true) err = setFileLock(f, readOnly, true)
if err != nil { if err != nil {
f.Close() f.Close()
return return
@ -38,7 +47,7 @@ func newFileLock(path string) (fl fileLock, err error) {
return return
} }
func setFileLock(f *os.File, lock bool) error { func setFileLock(f *os.File, readOnly, lock bool) error {
flock := syscall.Flock_t{ flock := syscall.Flock_t{
Type: syscall.F_UNLCK, Type: syscall.F_UNLCK,
Start: 0, Start: 0,
@ -46,8 +55,12 @@ func setFileLock(f *os.File, lock bool) error {
Whence: 1, Whence: 1,
} }
if lock { if lock {
if readOnly {
flock.Type = syscall.F_RDLCK
} else {
flock.Type = syscall.F_WRLCK flock.Type = syscall.F_WRLCK
} }
}
return syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &flock) return syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &flock)
} }

View file

@ -18,18 +18,27 @@ type unixFileLock struct {
} }
func (fl *unixFileLock) release() error { func (fl *unixFileLock) release() error {
if err := setFileLock(fl.f, false); err != nil { if err := setFileLock(fl.f, false, false); err != nil {
return err return err
} }
return fl.f.Close() return fl.f.Close()
} }
func newFileLock(path string) (fl fileLock, err error) { func newFileLock(path string, readOnly bool) (fl fileLock, err error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644) var flag int
if readOnly {
flag = os.O_RDONLY
} else {
flag = os.O_RDWR
}
f, err := os.OpenFile(path, flag, 0)
if os.IsNotExist(err) {
f, err = os.OpenFile(path, flag|os.O_CREATE, 0644)
}
if err != nil { if err != nil {
return return
} }
err = setFileLock(f, true) err = setFileLock(f, readOnly, true)
if err != nil { if err != nil {
f.Close() f.Close()
return return
@ -38,11 +47,15 @@ func newFileLock(path string) (fl fileLock, err error) {
return return
} }
func setFileLock(f *os.File, lock bool) error { func setFileLock(f *os.File, readOnly, lock bool) error {
how := syscall.LOCK_UN how := syscall.LOCK_UN
if lock { if lock {
if readOnly {
how = syscall.LOCK_SH
} else {
how = syscall.LOCK_EX how = syscall.LOCK_EX
} }
}
return syscall.Flock(int(f.Fd()), how|syscall.LOCK_NB) return syscall.Flock(int(f.Fd()), how|syscall.LOCK_NB)
} }

View file

@ -29,12 +29,22 @@ func (fl *windowsFileLock) release() error {
return syscall.Close(fl.fd) return syscall.Close(fl.fd)
} }
func newFileLock(path string) (fl fileLock, err error) { func newFileLock(path string, readOnly bool) (fl fileLock, err error) {
pathp, err := syscall.UTF16PtrFromString(path) pathp, err := syscall.UTF16PtrFromString(path)
if err != nil { if err != nil {
return return
} }
fd, err := syscall.CreateFile(pathp, syscall.GENERIC_READ|syscall.GENERIC_WRITE, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) var access, shareMode uint32
if readOnly {
access = syscall.GENERIC_READ
shareMode = syscall.FILE_SHARE_READ
} else {
access = syscall.GENERIC_READ | syscall.GENERIC_WRITE
}
fd, err := syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_EXISTING, syscall.FILE_ATTRIBUTE_NORMAL, 0)
if err == syscall.ERROR_FILE_NOT_FOUND {
fd, err = syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0)
}
if err != nil { if err != nil {
return return
} }
@ -47,9 +57,8 @@ func moveFileEx(from *uint16, to *uint16, flags uint32) error {
if r1 == 0 { if r1 == 0 {
if e1 != 0 { if e1 != 0 {
return error(e1) return error(e1)
} else {
return syscall.EINVAL
} }
return syscall.EINVAL
} }
return nil return nil
} }

View file

@ -10,8 +10,6 @@ import (
"bytes" "bytes"
"os" "os"
"sync" "sync"
"github.com/syndtr/goleveldb/leveldb/util"
) )
const typeShift = 3 const typeShift = 3
@ -35,7 +33,7 @@ type memStorage struct {
mu sync.Mutex mu sync.Mutex
slock *memStorageLock slock *memStorageLock
files map[uint64]*memFile files map[uint64]*memFile
manifest *memFilePtr meta FileDesc
} }
// NewMemStorage returns a new memory-backed storage implementation. // NewMemStorage returns a new memory-backed storage implementation.
@ -45,7 +43,7 @@ func NewMemStorage() Storage {
} }
} }
func (ms *memStorage) Lock() (util.Releaser, error) { func (ms *memStorage) Lock() (Lock, error) {
ms.mu.Lock() ms.mu.Lock()
defer ms.mu.Unlock() defer ms.mu.Unlock()
if ms.slock != nil { if ms.slock != nil {
@ -57,147 +55,164 @@ func (ms *memStorage) Lock() (util.Releaser, error) {
func (*memStorage) Log(str string) {} func (*memStorage) Log(str string) {}
func (ms *memStorage) GetFile(num uint64, t FileType) File { func (ms *memStorage) SetMeta(fd FileDesc) error {
return &memFilePtr{ms: ms, num: num, t: t} if !FileDescOk(fd) {
}
func (ms *memStorage) GetFiles(t FileType) ([]File, error) {
ms.mu.Lock()
var ff []File
for x, _ := range ms.files {
num, mt := x>>typeShift, FileType(x)&TypeAll
if mt&t == 0 {
continue
}
ff = append(ff, &memFilePtr{ms: ms, num: num, t: mt})
}
ms.mu.Unlock()
return ff, nil
}
func (ms *memStorage) GetManifest() (File, error) {
ms.mu.Lock()
defer ms.mu.Unlock()
if ms.manifest == nil {
return nil, os.ErrNotExist
}
return ms.manifest, nil
}
func (ms *memStorage) SetManifest(f File) error {
fm, ok := f.(*memFilePtr)
if !ok || fm.t != TypeManifest {
return ErrInvalidFile return ErrInvalidFile
} }
ms.mu.Lock() ms.mu.Lock()
ms.manifest = fm ms.meta = fd
ms.mu.Unlock() ms.mu.Unlock()
return nil return nil
} }
func (*memStorage) Close() error { return nil } func (ms *memStorage) GetMeta() (FileDesc, error) {
type memReader struct {
*bytes.Reader
m *memFile
}
func (mr *memReader) Close() error {
return mr.m.Close()
}
type memFile struct {
bytes.Buffer
ms *memStorage
open bool
}
func (*memFile) Sync() error { return nil }
func (m *memFile) Close() error {
m.ms.mu.Lock()
m.open = false
m.ms.mu.Unlock()
return nil
}
type memFilePtr struct {
ms *memStorage
num uint64
t FileType
}
func (p *memFilePtr) x() uint64 {
return p.Num()<<typeShift | uint64(p.Type())
}
func (p *memFilePtr) Open() (Reader, error) {
ms := p.ms
ms.mu.Lock() ms.mu.Lock()
defer ms.mu.Unlock() defer ms.mu.Unlock()
if m, exist := ms.files[p.x()]; exist { if ms.meta.Nil() {
return FileDesc{}, os.ErrNotExist
}
return ms.meta, nil
}
func (ms *memStorage) List(ft FileType) ([]FileDesc, error) {
ms.mu.Lock()
var fds []FileDesc
for x, _ := range ms.files {
fd := unpackFile(x)
if fd.Type&ft != 0 {
fds = append(fds, fd)
}
}
ms.mu.Unlock()
return fds, nil
}
func (ms *memStorage) Open(fd FileDesc) (Reader, error) {
if !FileDescOk(fd) {
return nil, ErrInvalidFile
}
ms.mu.Lock()
defer ms.mu.Unlock()
if m, exist := ms.files[packFile(fd)]; exist {
if m.open { if m.open {
return nil, errFileOpen return nil, errFileOpen
} }
m.open = true m.open = true
return &memReader{Reader: bytes.NewReader(m.Bytes()), m: m}, nil return &memReader{Reader: bytes.NewReader(m.Bytes()), ms: ms, m: m}, nil
} }
return nil, os.ErrNotExist return nil, os.ErrNotExist
} }
func (p *memFilePtr) Create() (Writer, error) { func (ms *memStorage) Create(fd FileDesc) (Writer, error) {
ms := p.ms if !FileDescOk(fd) {
return nil, ErrInvalidFile
}
x := packFile(fd)
ms.mu.Lock() ms.mu.Lock()
defer ms.mu.Unlock() defer ms.mu.Unlock()
m, exist := ms.files[p.x()] m, exist := ms.files[x]
if exist { if exist {
if m.open { if m.open {
return nil, errFileOpen return nil, errFileOpen
} }
m.Reset() m.Reset()
} else { } else {
m = &memFile{ms: ms} m = &memFile{}
ms.files[p.x()] = m ms.files[x] = m
} }
m.open = true m.open = true
return m, nil return &memWriter{memFile: m, ms: ms}, nil
} }
func (p *memFilePtr) Replace(newfile File) error { func (ms *memStorage) Remove(fd FileDesc) error {
p1, ok := newfile.(*memFilePtr) if !FileDescOk(fd) {
if !ok {
return ErrInvalidFile return ErrInvalidFile
} }
ms := p.ms
x := packFile(fd)
ms.mu.Lock() ms.mu.Lock()
defer ms.mu.Unlock() defer ms.mu.Unlock()
m1, exist := ms.files[p1.x()] if _, exist := ms.files[x]; exist {
delete(ms.files, x)
return nil
}
return os.ErrNotExist
}
func (ms *memStorage) Rename(oldfd, newfd FileDesc) error {
if FileDescOk(oldfd) || FileDescOk(newfd) {
return ErrInvalidFile
}
if oldfd == newfd {
return nil
}
oldx := packFile(oldfd)
newx := packFile(newfd)
ms.mu.Lock()
defer ms.mu.Unlock()
oldm, exist := ms.files[oldx]
if !exist { if !exist {
return os.ErrNotExist return os.ErrNotExist
} }
m0, exist := ms.files[p.x()] newm, exist := ms.files[newx]
if (exist && m0.open) || m1.open { if (exist && newm.open) || oldm.open {
return errFileOpen return errFileOpen
} }
delete(ms.files, p1.x()) delete(ms.files, oldx)
ms.files[p.x()] = m1 ms.files[newx] = oldm
return nil return nil
} }
func (p *memFilePtr) Type() FileType { func (*memStorage) Close() error { return nil }
return p.t
type memFile struct {
bytes.Buffer
open bool
} }
func (p *memFilePtr) Num() uint64 { type memReader struct {
return p.num *bytes.Reader
ms *memStorage
m *memFile
closed bool
} }
func (p *memFilePtr) Remove() error { func (mr *memReader) Close() error {
ms := p.ms mr.ms.mu.Lock()
ms.mu.Lock() defer mr.ms.mu.Unlock()
defer ms.mu.Unlock() if mr.closed {
if _, exist := ms.files[p.x()]; exist { return ErrClosed
delete(ms.files, p.x()) }
mr.m.open = false
return nil return nil
} }
return os.ErrNotExist
type memWriter struct {
*memFile
ms *memStorage
closed bool
}
func (*memWriter) Sync() error { return nil }
func (mw *memWriter) Close() error {
mw.ms.mu.Lock()
defer mw.ms.mu.Unlock()
if mw.closed {
return ErrClosed
}
mw.memFile.open = false
return nil
}
func packFile(fd FileDesc) uint64 {
return uint64(fd.Num)<<typeShift | uint64(fd.Type)
}
func unpackFile(x uint64) FileDesc {
return FileDesc{FileType(x) & TypeAll, int64(x >> typeShift)}
} }

View file

@ -15,7 +15,7 @@ import (
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
type FileType uint32 type FileType int
const ( const (
TypeManifest FileType = 1 << iota TypeManifest FileType = 1 << iota
@ -50,13 +50,13 @@ var (
// a file. Package storage has its own type instead of using // a file. Package storage has its own type instead of using
// errors.ErrCorrupted to prevent circular import. // errors.ErrCorrupted to prevent circular import.
type ErrCorrupted struct { type ErrCorrupted struct {
File *FileInfo Fd FileDesc
Err error Err error
} }
func (e *ErrCorrupted) Error() string { func (e *ErrCorrupted) Error() string {
if e.File != nil { if !e.Fd.Nil() {
return fmt.Sprintf("%v [file=%v]", e.Err, e.File) return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd)
} else { } else {
return e.Err.Error() return e.Err.Error()
} }
@ -83,31 +83,47 @@ type Writer interface {
Syncer Syncer
} }
// File is the file. A file instance must be goroutine-safe. type Lock interface {
type File interface { util.Releaser
// Open opens the file for read. Returns os.ErrNotExist error }
// if the file does not exist.
// Returns ErrClosed if the underlying storage is closed.
Open() (r Reader, err error)
// Create creates the file for writting. Truncate the file if // FileDesc is a file descriptor.
// already exist. type FileDesc struct {
// Returns ErrClosed if the underlying storage is closed. Type FileType
Create() (w Writer, err error) Num int64
}
// Replace replaces file with newfile. func (fd FileDesc) String() string {
// Returns ErrClosed if the underlying storage is closed. switch fd.Type {
Replace(newfile File) error case TypeManifest:
return fmt.Sprintf("MANIFEST-%06d", fd.Num)
case TypeJournal:
return fmt.Sprintf("%06d.log", fd.Num)
case TypeTable:
return fmt.Sprintf("%06d.ldb", fd.Num)
case TypeTemp:
return fmt.Sprintf("%06d.tmp", fd.Num)
default:
return fmt.Sprintf("%#x-%d", fd.Type, fd.Num)
}
}
// Type returns the file type // Nil returns true if fd == (FileDesc{}).
Type() FileType func (fd FileDesc) Nil() bool {
return fd == (FileDesc{})
}
// Num returns the file number. // FileDescOk returns true if fd is a valid file descriptor.
Num() uint64 func FileDescOk(fd FileDesc) bool {
switch fd.Type {
// Remove removes the file. case TypeManifest:
// Returns ErrClosed if the underlying storage is closed. case TypeJournal:
Remove() error case TypeTable:
case TypeTemp:
default:
return false
}
return fd.Num >= 0
} }
// Storage is the storage. A storage instance must be goroutine-safe. // Storage is the storage. A storage instance must be goroutine-safe.
@ -115,59 +131,47 @@ type Storage interface {
// Lock locks the storage. Any subsequent attempt to call Lock will fail // Lock locks the storage. Any subsequent attempt to call Lock will fail
// until the last lock released. // until the last lock released.
// After use the caller should call the Release method. // After use the caller should call the Release method.
Lock() (l util.Releaser, err error) Lock() (Lock, error)
// Log logs a string. This is used for logging. An implementation // Log logs a string. This is used for logging.
// may write to a file, stdout or simply do nothing. // An implementation may write to a file, stdout or simply do nothing.
Log(str string) Log(str string)
// GetFile returns a file for the given number and type. GetFile will never // SetMeta sets to point to the given fd, which then can be acquired using
// returns nil, even if the underlying storage is closed. // GetMeta method.
GetFile(num uint64, t FileType) File // SetMeta should be implemented in such way that changes should happened
// atomically.
SetMeta(fd FileDesc) error
// GetFiles returns a slice of files that match the given file types. // GetManifest returns a manifest file.
// Returns os.ErrNotExist if meta doesn't point to any fd, or point to fd
// that doesn't exist.
GetMeta() (FileDesc, error)
// List returns fds that match the given file types.
// The file types may be OR'ed together. // The file types may be OR'ed together.
GetFiles(t FileType) ([]File, error) List(ft FileType) ([]FileDesc, error)
// GetManifest returns a manifest file. Returns os.ErrNotExist if manifest // Open opens file with the given fd read-only.
// file does not exist. // Returns os.ErrNotExist error if the file does not exist.
GetManifest() (File, error) // Returns ErrClosed if the underlying storage is closed.
Open(fd FileDesc) (Reader, error)
// SetManifest sets the given file as manifest file. The given file should // Create creates file with the given fd, truncate if already exist and
// be a manifest file type or error will be returned. // opens write-only.
SetManifest(f File) error // Returns ErrClosed if the underlying storage is closed.
Create(fd FileDesc) (Writer, error)
// Close closes the storage. It is valid to call Close multiple times. // Remove removes file with the given fd.
// Other methods should not be called after the storage has been closed. // Returns ErrClosed if the underlying storage is closed.
Remove(fd FileDesc) error
// Rename renames file from oldfd to newfd.
// Returns ErrClosed if the underlying storage is closed.
Rename(oldfd, newfd FileDesc) error
// Close closes the storage.
// It is valid to call Close multiple times. Other methods should not be
// called after the storage has been closed.
Close() error Close() error
} }
// FileInfo wraps basic file info.
type FileInfo struct {
Type FileType
Num uint64
}
func (fi FileInfo) String() string {
switch fi.Type {
case TypeManifest:
return fmt.Sprintf("MANIFEST-%06d", fi.Num)
case TypeJournal:
return fmt.Sprintf("%06d.log", fi.Num)
case TypeTable:
return fmt.Sprintf("%06d.ldb", fi.Num)
case TypeTemp:
return fmt.Sprintf("%06d.tmp", fi.Num)
default:
return fmt.Sprintf("%#x-%d", fi.Type, fi.Num)
}
}
// NewFileInfo creates new FileInfo from the given File. It will returns nil
// if File is nil.
func NewFileInfo(f File) *FileInfo {
if f == nil {
return nil
}
return &FileInfo{f.Type(), f.Num()}
}

View file

@ -21,10 +21,10 @@ import (
// tFile holds basic information about a table. // tFile holds basic information about a table.
type tFile struct { type tFile struct {
file storage.File fd storage.FileDesc
seekLeft int32 seekLeft int32
size uint64 size int64
imin, imax iKey imin, imax internalKey
} }
// Returns true if given key is after largest key of this table. // Returns true if given key is after largest key of this table.
@ -48,9 +48,9 @@ func (t *tFile) consumeSeek() int32 {
} }
// Creates new tFile. // Creates new tFile.
func newTableFile(file storage.File, size uint64, imin, imax iKey) *tFile { func newTableFile(fd storage.FileDesc, size int64, imin, imax internalKey) *tFile {
f := &tFile{ f := &tFile{
file: file, fd: fd,
size: size, size: size,
imin: imin, imin: imin,
imax: imax, imax: imax,
@ -77,6 +77,10 @@ func newTableFile(file storage.File, size uint64, imin, imax iKey) *tFile {
return f return f
} }
func tableFileFromRecord(r atRecord) *tFile {
return newTableFile(storage.FileDesc{storage.TypeTable, r.num}, r.size, r.imin, r.imax)
}
// tFiles hold multiple tFile. // tFiles hold multiple tFile.
type tFiles []*tFile type tFiles []*tFile
@ -89,7 +93,7 @@ func (tf tFiles) nums() string {
if i != 0 { if i != 0 {
x += ", " x += ", "
} }
x += fmt.Sprint(f.file.Num()) x += fmt.Sprint(f.fd.Num)
} }
x += " ]" x += " ]"
return x return x
@ -101,7 +105,7 @@ func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
a, b := tf[i], tf[j] a, b := tf[i], tf[j]
n := icmp.Compare(a.imin, b.imin) n := icmp.Compare(a.imin, b.imin)
if n == 0 { if n == 0 {
return a.file.Num() < b.file.Num() return a.fd.Num < b.fd.Num
} }
return n < 0 return n < 0
} }
@ -109,7 +113,7 @@ func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
// Returns true if i file number is greater than j. // Returns true if i file number is greater than j.
// This used for sort by file number in descending order. // This used for sort by file number in descending order.
func (tf tFiles) lessByNum(i, j int) bool { func (tf tFiles) lessByNum(i, j int) bool {
return tf[i].file.Num() > tf[j].file.Num() return tf[i].fd.Num > tf[j].fd.Num
} }
// Sorts tables by key in ascending order. // Sorts tables by key in ascending order.
@ -123,7 +127,7 @@ func (tf tFiles) sortByNum() {
} }
// Returns sum of all tables size. // Returns sum of all tables size.
func (tf tFiles) size() (sum uint64) { func (tf tFiles) size() (sum int64) {
for _, t := range tf { for _, t := range tf {
sum += t.size sum += t.size
} }
@ -132,7 +136,7 @@ func (tf tFiles) size() (sum uint64) {
// Searches smallest index of tables whose its smallest // Searches smallest index of tables whose its smallest
// key is after or equal with given key. // key is after or equal with given key.
func (tf tFiles) searchMin(icmp *iComparer, ikey iKey) int { func (tf tFiles) searchMin(icmp *iComparer, ikey internalKey) int {
return sort.Search(len(tf), func(i int) bool { return sort.Search(len(tf), func(i int) bool {
return icmp.Compare(tf[i].imin, ikey) >= 0 return icmp.Compare(tf[i].imin, ikey) >= 0
}) })
@ -140,7 +144,7 @@ func (tf tFiles) searchMin(icmp *iComparer, ikey iKey) int {
// Searches smallest index of tables whose its largest // Searches smallest index of tables whose its largest
// key is after or equal with given key. // key is after or equal with given key.
func (tf tFiles) searchMax(icmp *iComparer, ikey iKey) int { func (tf tFiles) searchMax(icmp *iComparer, ikey internalKey) int {
return sort.Search(len(tf), func(i int) bool { return sort.Search(len(tf), func(i int) bool {
return icmp.Compare(tf[i].imax, ikey) >= 0 return icmp.Compare(tf[i].imax, ikey) >= 0
}) })
@ -162,7 +166,7 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
i := 0 i := 0
if len(umin) > 0 { if len(umin) > 0 {
// Find the earliest possible internal key for min. // Find the earliest possible internal key for min.
i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek)) i = tf.searchMax(icmp, makeInternalKey(nil, umin, keyMaxSeq, keyTypeSeek))
} }
if i >= len(tf) { if i >= len(tf) {
// Beginning of range is after all files, so no overlap. // Beginning of range is after all files, so no overlap.
@ -205,7 +209,7 @@ func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, ove
} }
// Returns tables key range. // Returns tables key range.
func (tf tFiles) getRange(icmp *iComparer) (imin, imax iKey) { func (tf tFiles) getRange(icmp *iComparer) (imin, imax internalKey) {
for i, t := range tf { for i, t := range tf {
if i == 0 { if i == 0 {
imin, imax = t.imin, t.imax imin, imax = t.imin, t.imax
@ -227,10 +231,10 @@ func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range
if slice != nil { if slice != nil {
var start, limit int var start, limit int
if slice.Start != nil { if slice.Start != nil {
start = tf.searchMax(icmp, iKey(slice.Start)) start = tf.searchMax(icmp, internalKey(slice.Start))
} }
if slice.Limit != nil { if slice.Limit != nil {
limit = tf.searchMin(icmp, iKey(slice.Limit)) limit = tf.searchMin(icmp, internalKey(slice.Limit))
} else { } else {
limit = tf.Len() limit = tf.Len()
} }
@ -255,7 +259,7 @@ type tFilesArrayIndexer struct {
} }
func (a *tFilesArrayIndexer) Search(key []byte) int { func (a *tFilesArrayIndexer) Search(key []byte) int {
return a.searchMax(a.icmp, iKey(key)) return a.searchMax(a.icmp, internalKey(key))
} }
func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator { func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator {
@ -295,14 +299,14 @@ type tOps struct {
// Creates an empty table and returns table writer. // Creates an empty table and returns table writer.
func (t *tOps) create() (*tWriter, error) { func (t *tOps) create() (*tWriter, error) {
file := t.s.getTableFile(t.s.allocFileNum()) fd := storage.FileDesc{storage.TypeTable, t.s.allocFileNum()}
fw, err := file.Create() fw, err := t.s.stor.Create(fd)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &tWriter{ return &tWriter{
t: t, t: t,
file: file, fd: fd,
w: fw, w: fw,
tw: table.NewWriter(fw, t.s.o.Options), tw: table.NewWriter(fw, t.s.o.Options),
}, nil }, nil
@ -340,21 +344,20 @@ func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) {
// Opens table. It returns a cache handle, which should // Opens table. It returns a cache handle, which should
// be released after use. // be released after use.
func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) { func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) {
num := f.file.Num() ch = t.cache.Get(0, uint64(f.fd.Num), func() (size int, value cache.Value) {
ch = t.cache.Get(0, num, func() (size int, value cache.Value) {
var r storage.Reader var r storage.Reader
r, err = f.file.Open() r, err = t.s.stor.Open(f.fd)
if err != nil { if err != nil {
return 0, nil return 0, nil
} }
var bcache *cache.CacheGetter var bcache *cache.NamespaceGetter
if t.bcache != nil { if t.bcache != nil {
bcache = &cache.CacheGetter{Cache: t.bcache, NS: num} bcache = &cache.NamespaceGetter{Cache: t.bcache, NS: uint64(f.fd.Num)}
} }
var tr *table.Reader var tr *table.Reader
tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcache, t.bpool, t.s.o.Options) tr, err = table.NewReader(r, f.size, f.fd, bcache, t.bpool, t.s.o.Options)
if err != nil { if err != nil {
r.Close() r.Close()
return 0, nil return 0, nil
@ -390,14 +393,13 @@ func (t *tOps) findKey(f *tFile, key []byte, ro *opt.ReadOptions) (rkey []byte,
} }
// Returns approximate offset of the given key. // Returns approximate offset of the given key.
func (t *tOps) offsetOf(f *tFile, key []byte) (offset uint64, err error) { func (t *tOps) offsetOf(f *tFile, key []byte) (offset int64, err error) {
ch, err := t.open(f) ch, err := t.open(f)
if err != nil { if err != nil {
return return
} }
defer ch.Release() defer ch.Release()
offset_, err := ch.Value().(*table.Reader).OffsetOf(key) return ch.Value().(*table.Reader).OffsetOf(key)
return uint64(offset_), err
} }
// Creates an iterator from the given table. // Creates an iterator from the given table.
@ -414,15 +416,14 @@ func (t *tOps) newIterator(f *tFile, slice *util.Range, ro *opt.ReadOptions) ite
// Removes table from persistent storage. It waits until // Removes table from persistent storage. It waits until
// no one use the the table. // no one use the the table.
func (t *tOps) remove(f *tFile) { func (t *tOps) remove(f *tFile) {
num := f.file.Num() t.cache.Delete(0, uint64(f.fd.Num), func() {
t.cache.Delete(0, num, func() { if err := t.s.stor.Remove(f.fd); err != nil {
if err := f.file.Remove(); err != nil { t.s.logf("table@remove removing @%d %q", f.fd.Num, err)
t.s.logf("table@remove removing @%d %q", num, err)
} else { } else {
t.s.logf("table@remove removed @%d", num) t.s.logf("table@remove removed @%d", f.fd.Num)
} }
if t.bcache != nil { if t.bcache != nil {
t.bcache.EvictNS(num) t.bcache.EvictNS(uint64(f.fd.Num))
} }
}) })
} }
@ -471,7 +472,7 @@ func newTableOps(s *session) *tOps {
type tWriter struct { type tWriter struct {
t *tOps t *tOps
file storage.File fd storage.FileDesc
w storage.Writer w storage.Writer
tw *table.Writer tw *table.Writer
@ -513,16 +514,15 @@ func (w *tWriter) finish() (f *tFile, err error) {
return return
} }
} }
f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last)) f = newTableFile(w.fd, int64(w.tw.BytesLen()), internalKey(w.first), internalKey(w.last))
return return
} }
// Drops the table. // Drops the table.
func (w *tWriter) drop() { func (w *tWriter) drop() {
w.close() w.close()
w.file.Remove() w.t.s.stor.Remove(w.fd)
w.t.s.reuseFileNum(w.file.Num()) w.t.s.reuseFileNum(w.fd.Num)
w.file = nil
w.tw = nil w.tw = nil
w.first = nil w.first = nil
w.last = nil w.last = nil

View file

@ -507,9 +507,9 @@ func (i *indexIter) Get() iterator.Iterator {
// Reader is a table reader. // Reader is a table reader.
type Reader struct { type Reader struct {
mu sync.RWMutex mu sync.RWMutex
fi *storage.FileInfo fd storage.FileDesc
reader io.ReaderAt reader io.ReaderAt
cache *cache.CacheGetter cache *cache.NamespaceGetter
err error err error
bpool *util.BufferPool bpool *util.BufferPool
// Options // Options
@ -539,7 +539,7 @@ func (r *Reader) blockKind(bh blockHandle) string {
} }
func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error { func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error {
return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}} return &errors.ErrCorrupted{Fd: r.fd, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}}
} }
func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error { func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error {
@ -551,7 +551,7 @@ func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error {
cerr.Pos = int64(bh.offset) cerr.Pos = int64(bh.offset)
cerr.Size = int64(bh.length) cerr.Size = int64(bh.length)
cerr.Kind = r.blockKind(bh) cerr.Kind = r.blockKind(bh)
return &errors.ErrCorrupted{File: r.fi, Err: cerr} return &errors.ErrCorrupted{Fd: r.fd, Err: cerr}
} }
return err return err
} }
@ -988,13 +988,13 @@ func (r *Reader) Release() {
// The fi, cache and bpool is optional and can be nil. // The fi, cache and bpool is optional and can be nil.
// //
// The returned table reader instance is goroutine-safe. // The returned table reader instance is goroutine-safe.
func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache *cache.CacheGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) { func NewReader(f io.ReaderAt, size int64, fd storage.FileDesc, cache *cache.NamespaceGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
if f == nil { if f == nil {
return nil, errors.New("leveldb/table: nil file") return nil, errors.New("leveldb/table: nil file")
} }
r := &Reader{ r := &Reader{
fi: fi, fd: fd,
reader: f, reader: f,
cache: cache, cache: cache,
bpool: bpool, bpool: bpool,

View file

@ -72,20 +72,20 @@ func maxInt(a, b int) int {
return b return b
} }
type files []storage.File type fdSorter []storage.FileDesc
func (p files) Len() int { func (p fdSorter) Len() int {
return len(p) return len(p)
} }
func (p files) Less(i, j int) bool { func (p fdSorter) Less(i, j int) bool {
return p[i].Num() < p[j].Num() return p[i].Num < p[j].Num
} }
func (p files) Swap(i, j int) { func (p fdSorter) Swap(i, j int) {
p[i], p[j] = p[j], p[i] p[i], p[j] = p[j], p[i]
} }
func (p files) sort() { func sortFds(fds []storage.FileDesc) {
sort.Sort(p) sort.Sort(fdSorter(fds))
} }

View file

@ -7,38 +7,38 @@
package util package util
import ( import (
"bytes"
"encoding/binary" "encoding/binary"
) )
// Hash return hash of the given data. // Hash return hash of the given data.
func Hash(data []byte, seed uint32) uint32 { func Hash(data []byte, seed uint32) uint32 {
// Similar to murmur hash // Similar to murmur hash
var m uint32 = 0xc6a4a793 const (
var r uint32 = 24 m = uint32(0xc6a4a793)
h := seed ^ (uint32(len(data)) * m) r = uint32(24)
)
var (
h = seed ^ (uint32(len(data)) * m)
i int
)
buf := bytes.NewBuffer(data) for n := len(data) - len(data)%4; i < n; i += 4 {
for buf.Len() >= 4 { h += binary.LittleEndian.Uint32(data[i:])
var w uint32
binary.Read(buf, binary.LittleEndian, &w)
h += w
h *= m h *= m
h ^= (h >> 16) h ^= (h >> 16)
} }
rest := buf.Bytes() switch len(data) - i {
switch len(rest) {
default: default:
panic("not reached") panic("not reached")
case 3: case 3:
h += uint32(rest[2]) << 16 h += uint32(data[i+2]) << 16
fallthrough fallthrough
case 2: case 2:
h += uint32(rest[1]) << 8 h += uint32(data[i+1]) << 8
fallthrough fallthrough
case 1: case 1:
h += uint32(rest[0]) h += uint32(data[i])
h *= m h *= m
h ^= (h >> r) h ^= (h >> r)
case 0: case 0:

View file

@ -1,21 +0,0 @@
// Copyright (c) 2014, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// +build go1.3
package util
import (
"sync"
)
type Pool struct {
sync.Pool
}
func NewPool(cap int) *Pool {
return &Pool{}
}

View file

@ -1,33 +0,0 @@
// Copyright (c) 2014, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// +build !go1.3
package util
type Pool struct {
pool chan interface{}
}
func (p *Pool) Get() interface{} {
select {
case x := <-p.pool:
return x
default:
return nil
}
}
func (p *Pool) Put(x interface{}) {
select {
case p.pool <- x:
default:
}
}
func NewPool(cap int) *Pool {
return &Pool{pool: make(chan interface{}, cap)}
}

View file

@ -7,6 +7,7 @@
package leveldb package leveldb
import ( import (
"fmt"
"sync/atomic" "sync/atomic"
"unsafe" "unsafe"
@ -23,7 +24,7 @@ type tSet struct {
type version struct { type version struct {
s *session s *session
tables []tFiles levels []tFiles
// Level that should be compacted next and its compaction score. // Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields // Score < 1 means compaction is not strictly needed. These fields
@ -39,7 +40,7 @@ type version struct {
} }
func newVersion(s *session) *version { func newVersion(s *session) *version {
return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())} return &version{s: s}
} }
func (v *version) releaseNB() { func (v *version) releaseNB() {
@ -51,18 +52,18 @@ func (v *version) releaseNB() {
panic("negative version ref") panic("negative version ref")
} }
tables := make(map[uint64]bool) nextTables := make(map[int64]bool)
for _, tt := range v.next.tables { for _, tt := range v.next.levels {
for _, t := range tt { for _, t := range tt {
num := t.file.Num() num := t.fd.Num
tables[num] = true nextTables[num] = true
} }
} }
for _, tt := range v.tables { for _, tt := range v.levels {
for _, t := range tt { for _, t := range tt {
num := t.file.Num() num := t.fd.Num
if _, ok := tables[num]; !ok { if _, ok := nextTables[num]; !ok {
v.s.tops.remove(t) v.s.tops.remove(t)
} }
} }
@ -78,11 +79,26 @@ func (v *version) release() {
v.s.vmu.Unlock() v.s.vmu.Unlock()
} }
func (v *version) walkOverlapping(ikey iKey, f func(level int, t *tFile) bool, lf func(level int) bool) { func (v *version) walkOverlapping(aux tFiles, ikey internalKey, f func(level int, t *tFile) bool, lf func(level int) bool) {
ukey := ikey.ukey() ukey := ikey.ukey()
// Aux level.
if aux != nil {
for _, t := range aux {
if t.overlaps(v.s.icmp, ukey, ukey) {
if !f(-1, t) {
return
}
}
}
if lf != nil && !lf(-1) {
return
}
}
// Walk tables level-by-level. // Walk tables level-by-level.
for level, tables := range v.tables { for level, tables := range v.levels {
if len(tables) == 0 { if len(tables) == 0 {
continue continue
} }
@ -114,7 +130,7 @@ func (v *version) walkOverlapping(ikey iKey, f func(level int, t *tFile) bool, l
} }
} }
func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byte, tcomp bool, err error) { func (v *version) get(aux tFiles, ikey internalKey, ro *opt.ReadOptions, noValue bool) (value []byte, tcomp bool, err error) {
ukey := ikey.ukey() ukey := ikey.ukey()
var ( var (
@ -124,16 +140,16 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
// Level-0. // Level-0.
zfound bool zfound bool
zseq uint64 zseq uint64
zkt kType zkt keyType
zval []byte zval []byte
) )
err = ErrNotFound err = ErrNotFound
// Since entries never hope across level, finding key/value // Since entries never hop across level, finding key/value
// in smaller level make later levels irrelevant. // in smaller level make later levels irrelevant.
v.walkOverlapping(ikey, func(level int, t *tFile) bool { v.walkOverlapping(aux, ikey, func(level int, t *tFile) bool {
if !tseek { if level >= 0 && !tseek {
if tset == nil { if tset == nil {
tset = &tSet{level, t} tset = &tSet{level, t}
} else { } else {
@ -150,6 +166,7 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
} else { } else {
fikey, fval, ferr = v.s.tops.find(t, ikey, ro) fikey, fval, ferr = v.s.tops.find(t, ikey, ro)
} }
switch ferr { switch ferr {
case nil: case nil:
case ErrNotFound: case ErrNotFound:
@ -159,9 +176,10 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
return false return false
} }
if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil { if fukey, fseq, fkt, fkerr := parseInternalKey(fikey); fkerr == nil {
if v.s.icmp.uCompare(ukey, fukey) == 0 { if v.s.icmp.uCompare(ukey, fukey) == 0 {
if level == 0 { // Level <= 0 may overlaps each-other.
if level <= 0 {
if fseq >= zseq { if fseq >= zseq {
zfound = true zfound = true
zseq = fseq zseq = fseq
@ -170,12 +188,12 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
} }
} else { } else {
switch fkt { switch fkt {
case ktVal: case keyTypeVal:
value = fval value = fval
err = nil err = nil
case ktDel: case keyTypeDel:
default: default:
panic("leveldb: invalid iKey type") panic("leveldb: invalid internalKey type")
} }
return false return false
} }
@ -189,12 +207,12 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
}, func(level int) bool { }, func(level int) bool {
if zfound { if zfound {
switch zkt { switch zkt {
case ktVal: case keyTypeVal:
value = zval value = zval
err = nil err = nil
case ktDel: case keyTypeDel:
default: default:
panic("leveldb: invalid iKey type") panic("leveldb: invalid internalKey type")
} }
return false return false
} }
@ -209,46 +227,40 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions, noValue bool) (value []byt
return return
} }
func (v *version) sampleSeek(ikey iKey) (tcomp bool) { func (v *version) sampleSeek(ikey internalKey) (tcomp bool) {
var tset *tSet var tset *tSet
v.walkOverlapping(ikey, func(level int, t *tFile) bool { v.walkOverlapping(nil, ikey, func(level int, t *tFile) bool {
if tset == nil { if tset == nil {
tset = &tSet{level, t} tset = &tSet{level, t}
return true return true
} else { }
if tset.table.consumeSeek() <= 0 { if tset.table.consumeSeek() <= 0 {
tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset))
} }
return false return false
}
}, nil) }, nil)
return return
} }
func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) { func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) {
// Merge all level zero files together since they may overlap
for _, t := range v.tables[0] {
it := v.s.tops.newIterator(t, slice, ro)
its = append(its, it)
}
strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader) strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader)
for _, tables := range v.tables[1:] { for level, tables := range v.levels {
if len(tables) == 0 { if level == 0 {
continue // Merge all level zero files together since they may overlap.
for _, t := range tables {
its = append(its, v.s.tops.newIterator(t, slice, ro))
}
} else if len(tables) != 0 {
its = append(its, iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict))
} }
it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)
its = append(its, it)
} }
return return
} }
func (v *version) newStaging() *versionStaging { func (v *version) newStaging() *versionStaging {
return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())} return &versionStaging{base: v}
} }
// Spawn a new version based on this version. // Spawn a new version based on this version.
@ -259,19 +271,22 @@ func (v *version) spawn(r *sessionRecord) *version {
} }
func (v *version) fillRecord(r *sessionRecord) { func (v *version) fillRecord(r *sessionRecord) {
for level, ts := range v.tables { for level, tables := range v.levels {
for _, t := range ts { for _, t := range tables {
r.addTableFile(level, t) r.addTableFile(level, t)
} }
} }
} }
func (v *version) tLen(level int) int { func (v *version) tLen(level int) int {
return len(v.tables[level]) if level < len(v.levels) {
return len(v.levels[level])
}
return 0
} }
func (v *version) offsetOf(ikey iKey) (n uint64, err error) { func (v *version) offsetOf(ikey internalKey) (n int64, err error) {
for level, tables := range v.tables { for level, tables := range v.levels {
for _, t := range tables { for _, t := range tables {
if v.s.icmp.Compare(t.imax, ikey) <= 0 { if v.s.icmp.Compare(t.imax, ikey) <= 0 {
// Entire file is before "ikey", so just add the file size // Entire file is before "ikey", so just add the file size
@ -287,12 +302,11 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) {
} else { } else {
// "ikey" falls in the range for this table. Add the // "ikey" falls in the range for this table. Add the
// approximate offset of "ikey" within the table. // approximate offset of "ikey" within the table.
var nn uint64 if m, err := v.s.tops.offsetOf(t, ikey); err == nil {
nn, err = v.s.tops.offsetOf(t, ikey) n += m
if err != nil { } else {
return 0, err return 0, err
} }
n += nn
} }
} }
} }
@ -300,37 +314,50 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) {
return return
} }
func (v *version) pickMemdbLevel(umin, umax []byte) (level int) { func (v *version) pickMemdbLevel(umin, umax []byte, maxLevel int) (level int) {
if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) { if maxLevel > 0 {
if len(v.levels) == 0 {
return maxLevel
}
if !v.levels[0].overlaps(v.s.icmp, umin, umax, true) {
var overlaps tFiles var overlaps tFiles
maxLevel := v.s.o.GetMaxMemCompationLevel()
for ; level < maxLevel; level++ { for ; level < maxLevel; level++ {
if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) { if pLevel := level + 1; pLevel >= len(v.levels) {
return maxLevel
} else if v.levels[pLevel].overlaps(v.s.icmp, umin, umax, false) {
break break
} }
overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false) if gpLevel := level + 2; gpLevel < len(v.levels) {
if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) { overlaps = v.levels[gpLevel].getOverlaps(overlaps, v.s.icmp, umin, umax, false)
if overlaps.size() > int64(v.s.o.GetCompactionGPOverlaps(level)) {
break break
} }
} }
} }
}
}
return return
} }
func (v *version) computeCompaction() { func (v *version) computeCompaction() {
// Precomputed best level for next compaction // Precomputed best level for next compaction
var bestLevel int = -1 bestLevel := int(-1)
var bestScore float64 = -1 bestScore := float64(-1)
for level, tables := range v.tables { statFiles := make([]int, len(v.levels))
statSizes := make([]string, len(v.levels))
statScore := make([]string, len(v.levels))
statTotSize := int64(0)
for level, tables := range v.levels {
var score float64 var score float64
size := tables.size()
if level == 0 { if level == 0 {
// We treat level-0 specially by bounding the number of files // We treat level-0 specially by bounding the number of files
// instead of number of bytes for two reasons: // instead of number of bytes for two reasons:
// //
// (1) With larger write-buffer sizes, it is nice not to do too // (1) With larger write-buffer sizes, it is nice not to do too
// many level-0 compactions. // many level-0 compaction.
// //
// (2) The files in level-0 are merged on every read and // (2) The files in level-0 are merged on every read and
// therefore we wish to avoid too many files when the individual // therefore we wish to avoid too many files when the individual
@ -339,17 +366,24 @@ func (v *version) computeCompaction() {
// overwrites/deletions). // overwrites/deletions).
score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger()) score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger())
} else { } else {
score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level)) score = float64(size) / float64(v.s.o.GetCompactionTotalSize(level))
} }
if score > bestScore { if score > bestScore {
bestLevel = level bestLevel = level
bestScore = score bestScore = score
} }
statFiles[level] = len(tables)
statSizes[level] = shortenb(int(size))
statScore[level] = fmt.Sprintf("%.2f", score)
statTotSize += size
} }
v.cLevel = bestLevel v.cLevel = bestLevel
v.cScore = bestScore v.cScore = bestScore
v.s.logf("version@stat F·%v S·%s%v Sc·%v", statFiles, shortenb(int(statTotSize)), statSizes, statScore)
} }
func (v *version) needCompaction() bool { func (v *version) needCompaction() bool {
@ -357,43 +391,48 @@ func (v *version) needCompaction() bool {
} }
type tablesScratch struct { type tablesScratch struct {
added map[uint64]atRecord added map[int64]atRecord
deleted map[uint64]struct{} deleted map[int64]struct{}
} }
type versionStaging struct { type versionStaging struct {
base *version base *version
tables []tablesScratch levels []tablesScratch
}
func (p *versionStaging) getScratch(level int) *tablesScratch {
if level >= len(p.levels) {
newLevels := make([]tablesScratch, level+1)
copy(newLevels, p.levels)
p.levels = newLevels
}
return &(p.levels[level])
} }
func (p *versionStaging) commit(r *sessionRecord) { func (p *versionStaging) commit(r *sessionRecord) {
// Deleted tables. // Deleted tables.
for _, r := range r.deletedTables { for _, r := range r.deletedTables {
tm := &(p.tables[r.level]) scratch := p.getScratch(r.level)
if r.level < len(p.base.levels) && len(p.base.levels[r.level]) > 0 {
if len(p.base.tables[r.level]) > 0 { if scratch.deleted == nil {
if tm.deleted == nil { scratch.deleted = make(map[int64]struct{})
tm.deleted = make(map[uint64]struct{})
} }
tm.deleted[r.num] = struct{}{} scratch.deleted[r.num] = struct{}{}
} }
if scratch.added != nil {
if tm.added != nil { delete(scratch.added, r.num)
delete(tm.added, r.num)
} }
} }
// New tables. // New tables.
for _, r := range r.addedTables { for _, r := range r.addedTables {
tm := &(p.tables[r.level]) scratch := p.getScratch(r.level)
if scratch.added == nil {
if tm.added == nil { scratch.added = make(map[int64]atRecord)
tm.added = make(map[uint64]atRecord)
} }
tm.added[r.num] = r scratch.added[r.num] = r
if scratch.deleted != nil {
if tm.deleted != nil { delete(scratch.deleted, r.num)
delete(tm.deleted, r.num)
} }
} }
} }
@ -401,39 +440,62 @@ func (p *versionStaging) commit(r *sessionRecord) {
func (p *versionStaging) finish() *version { func (p *versionStaging) finish() *version {
// Build new version. // Build new version.
nv := newVersion(p.base.s) nv := newVersion(p.base.s)
for level, tm := range p.tables { numLevel := len(p.levels)
btables := p.base.tables[level] if len(p.base.levels) > numLevel {
numLevel = len(p.base.levels)
n := len(btables) + len(tm.added) - len(tm.deleted) }
if n < 0 { nv.levels = make([]tFiles, numLevel)
n = 0 for level := 0; level < numLevel; level++ {
var baseTabels tFiles
if level < len(p.base.levels) {
baseTabels = p.base.levels[level]
}
if level < len(p.levels) {
scratch := p.levels[level]
var nt tFiles
// Prealloc list if possible.
if n := len(baseTabels) + len(scratch.added) - len(scratch.deleted); n > 0 {
nt = make(tFiles, 0, n)
} }
nt := make(tFiles, 0, n)
// Base tables. // Base tables.
for _, t := range btables { for _, t := range baseTabels {
if _, ok := tm.deleted[t.file.Num()]; ok { if _, ok := scratch.deleted[t.fd.Num]; ok {
continue continue
} }
if _, ok := tm.added[t.file.Num()]; ok { if _, ok := scratch.added[t.fd.Num]; ok {
continue continue
} }
nt = append(nt, t) nt = append(nt, t)
} }
// New tables. // New tables.
for _, r := range tm.added { for _, r := range scratch.added {
nt = append(nt, p.base.s.tableFileFromRecord(r)) nt = append(nt, tableFileFromRecord(r))
} }
if len(nt) != 0 {
// Sort tables. // Sort tables.
if level == 0 { if level == 0 {
nt.sortByNum() nt.sortByNum()
} else { } else {
nt.sortByKey(p.base.s.icmp) nt.sortByKey(p.base.s.icmp)
} }
nv.tables[level] = nt
nv.levels[level] = nt
} }
} else {
nv.levels[level] = baseTabels
}
}
// Trim levels.
n := len(nv.levels)
for ; n > 0 && nv.levels[n-1] == nil; n-- {
}
nv.levels = nv.levels[:n]
// Compute compaction score for new version. // Compute compaction score for new version.
nv.computeCompaction() nv.computeCompaction()

60
vendor/vendor.json vendored
View file

@ -207,64 +207,76 @@
"revisionTime": "2015-08-17T10:50:50-07:00" "revisionTime": "2015-08-17T10:50:50-07:00"
}, },
{ {
"checksumSHA1": "sUPlrnoPPmYuvjEtw9HUTKPCZa4=",
"path": "github.com/syndtr/goleveldb/leveldb", "path": "github.com/syndtr/goleveldb/leveldb",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "BX+u3k6if9kZNYYqbL56gC48BAQ=",
"path": "github.com/syndtr/goleveldb/leveldb/cache", "path": "github.com/syndtr/goleveldb/leveldb/cache",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "5KPgnvCPlR0ysDAqo6jApzRQ3tw=",
"path": "github.com/syndtr/goleveldb/leveldb/comparer", "path": "github.com/syndtr/goleveldb/leveldb/comparer",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "Vpvz4qmbq/kz0SN95yt0tmSI7JE=",
"path": "github.com/syndtr/goleveldb/leveldb/errors", "path": "github.com/syndtr/goleveldb/leveldb/errors",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "eqKeD6DS7eNCtxVYZEHHRKkyZrw=",
"path": "github.com/syndtr/goleveldb/leveldb/filter", "path": "github.com/syndtr/goleveldb/leveldb/filter",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "cRn09EwfU3k2ZjvClHYmVFlakRY=",
"path": "github.com/syndtr/goleveldb/leveldb/iterator", "path": "github.com/syndtr/goleveldb/leveldb/iterator",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "CMBbso8ZuG2kBGDL2Blf/wpeheU=",
"path": "github.com/syndtr/goleveldb/leveldb/journal", "path": "github.com/syndtr/goleveldb/leveldb/journal",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "LshzRv+3spfwuHLepRxiyjf/3sQ=",
"path": "github.com/syndtr/goleveldb/leveldb/memdb", "path": "github.com/syndtr/goleveldb/leveldb/memdb",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "MP/sSiEbzIN5M664sO4r9+dwzV4=",
"path": "github.com/syndtr/goleveldb/leveldb/opt", "path": "github.com/syndtr/goleveldb/leveldb/opt",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "XO5e4bQsWDdNqoHbFWy2TKoOWrQ=",
"path": "github.com/syndtr/goleveldb/leveldb/storage", "path": "github.com/syndtr/goleveldb/leveldb/storage",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "4EGplyU1Q07vIczP2yZgKvjuYVA=",
"path": "github.com/syndtr/goleveldb/leveldb/table", "path": "github.com/syndtr/goleveldb/leveldb/table",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"checksumSHA1": "4zil8Gwg8VPkDn1YzlgCvtukJFU=",
"path": "github.com/syndtr/goleveldb/leveldb/util", "path": "github.com/syndtr/goleveldb/leveldb/util",
"revision": "1a9d62f03ea92815b46fcaab357cfd4df264b1a0", "revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
"revisionTime": "2015-08-19T12:16:22+07:00" "revisionTime": "2016-06-29T10:12:33Z"
}, },
{ {
"path": "github.com/vaughan0/go-ini", "path": "github.com/vaughan0/go-ini",