package pages import ( "errors" "fmt" "hash/fnv" "math" "os" "runtime" "sync" "time" "unsafe" ) // These errors can be returned when opening or calling methods on a DB. var ( // ErrDatabaseNotOpen is returned when a DB instance is accessed before it // is opened or after it is closed. ErrDatabaseNotOpen = errors.New("database not open") // ErrDatabaseOpen is returned when opening a database that is // already open. ErrDatabaseOpen = errors.New("database already open") // ErrInvalid is returned when both meta pages on a database are invalid. // This typically occurs when a file is not a bolt database. ErrInvalid = errors.New("invalid database") // ErrVersionMismatch is returned when the data file was created with a // different version of Bolt. ErrVersionMismatch = errors.New("version mismatch") // ErrChecksum is returned when either meta page checksum does not match. ErrChecksum = errors.New("checksum error") // ErrTimeout is returned when a database cannot obtain an exclusive lock // on the data file after the timeout passed to Open(). ErrTimeout = errors.New("timeout") // ErrNotFound is returned when a user page for an ID could not be found. ErrNotFound = errors.New("not found") ErrTxClosed = errors.New("transaction closed") ErrTxNotWritable = errors.New("transaction not writable") ) // Marker value that indicates that a file is a pagebuf file. const magic uint32 = 0xAFFEAFFE // The data file version. const version = 1 // The largest step that can be taken when remapping the mmap. const maxMmapStep = 1 << 30 // 1GB // defaultPageSize of the underlying buffers is set to the OS page size. var defaultPageSize = os.Getpagesize() // DB is an interface providing access to persistent byte chunks that // are backed by memory-mapped pages. type DB struct { // If you want to read the entire database fast, you can set MmapFlag to // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead. MmapFlags int // AllocSize is the amount of space allocated when the database // needs to create new pages. This is done to amortize the cost // of truncate() and fsync() when growing the data file. AllocSize int path string // location of the pagebuf file file *os.File // the opened file of path opened bool data *[maxMapSize]byte dataref []byte // mmap'ed readonly, write throws SEGV datasz int filesz int // current on disk file size pageSize int meta0 *meta meta1 *meta freelist *freelist rwtx *Tx txs []*Tx pagePool sync.Pool rwlock sync.Mutex // Allows only one writer at a time. metalock sync.Mutex // Protects meta page access. mmaplock sync.RWMutex // Protects mmap access during remapping ops struct { writeAt func(b []byte, off int64) (n int, err error) } } // Options defines configuration parameters with which a PageBuf is initialized. type Options struct { // Timeout is the amount of time to wait to obtain a file lock. // When set to zero it will wait indefinitely. This option is only // available on Darwin and Linux. Timeout time.Duration // Sets the DB.MmapFlags flag before memory mapping the file. MmapFlags int // XXX(fabxc): potentially allow setting different allocation strategies // to fit different use cases. // InitialMmapSize is the initial mmap size of the database // in bytes. // // If <=0, the initial map size is 0. // If initialMmapSize is smaller than the previous database size, // it takes no effect. InitialMmapSize int // PageSize defines a custom page size used. It cannot be changed later. // Must be a multiple of the operating system's default page size. PageSize int } // DefaultOptions specifies a set of default parameters used when a pagebuf // is opened without explicit options. var DefaultOptions = Options{ // Use the OS's default page size. PageSize: defaultPageSize, } // Default values if not set in a DB instance. const ( DefaultAllocSize = 16 * 1024 * 1024 ) // Open and create a new database under the given path. func Open(path string, mode os.FileMode, o *Options) (*DB, error) { db := &DB{ opened: true, } // Set default options if no options are provided. if o == nil { o = &DefaultOptions } db.MmapFlags = o.MmapFlags db.AllocSize = DefaultAllocSize flag := os.O_RDWR // Open data file and separate sync handler for metadata writes. db.path = path var err error if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil { _ = db.close() return nil, err } // Lock file so that other processes using pagebuf in read-write mode cannot // use the underlying data at the same time. if err := flock(db, mode, true, o.Timeout); err != nil { _ = db.close() return nil, err } // Default values for test hooks db.ops.writeAt = db.file.WriteAt // Initialize the database if it doesn't exist. if info, err := db.file.Stat(); err != nil { return nil, err } else if info.Size() == 0 { // Initialize new files with meta pages. if err := db.init(o.PageSize); err != nil { return nil, err } } else { // Read the first meta page to determine the page size. var buf [0x1000]byte if _, err := db.file.ReadAt(buf[:], 0); err == nil { m := db.pageInBuffer(buf[:], 0).meta() if err := m.validate(); err != nil { // We cannot verify which page sizes are used. return nil, fmt.Errorf("cannot read page size: %s", err) } else { db.pageSize = int(m.pageSize) } } else { return nil, fmt.Errorf("reading first meta page failed: %s", err) } } // Initialize page pool. db.pagePool = sync.Pool{ New: func() interface{} { return make([]byte, db.pageSize) }, } // Memory map the data file. if err := db.mmap(o.InitialMmapSize); err != nil { _ = db.close() return nil, err } // Read in the freelist. db.freelist = newFreelist() db.freelist.read(db.page(db.meta().freelist)) // Mark the database as opened and return. return db, nil } func validatePageSize(psz int) error { // Max value the content length can hold. if defaultPageSize > math.MaxUint16 { return fmt.Errorf("invalid page size %d", psz) } // Page size must be a multiple of OS page size so we stay // page aligned. if psz < defaultPageSize { if defaultPageSize%psz != 0 { return fmt.Errorf("invalid page size %d", psz) } } else if psz > defaultPageSize { if psz%defaultPageSize != 0 { return fmt.Errorf("invalid page size %d", psz) } } return nil } // init creates a new database file and initializes its meta pages. func (db *DB) init(psz int) error { if err := validatePageSize(psz); err != nil { return err } // Set the page size to the OS page size. db.pageSize = psz // Create two meta pages on a buffer. buf := make([]byte, db.pageSize*4) for i := 0; i < 2; i++ { p := db.pageInBuffer(buf[:], pgid(i)) p.id = pgid(i) p.flags = pageFlagMeta // Initialize the meta page. m := p.meta() m.magic = magic m.version = version m.pageSize = uint32(db.pageSize) m.freelist = 2 m.txid = txid(i) m.pgid = 4 // TODO(fabxc): we initialize with zero pages, what to do here? m.checksum = m.sum64() } // Write an empty freelist at page 3. p := db.pageInBuffer(buf[:], pgid(2)) p.id = pgid(2) p.flags = pageFlagFreelist p.count = 0 // Write the first empty page. p = db.pageInBuffer(buf[:], pgid(3)) p.id = pgid(3) p.flags = pageFlagData p.count = 0 // Write the buffer to our data file. if _, err := db.ops.writeAt(buf, 0); err != nil { return err } if err := fdatasync(db); err != nil { return err } return nil } // Sync executes fdatasync() against the database file handle. func (db *DB) Sync() error { return fdatasync(db) } // Close synchronizes and closes the memory-mapped pagebuf file. func (db *DB) Close() error { db.rwlock.Lock() defer db.rwlock.Unlock() db.metalock.Lock() defer db.metalock.Unlock() db.mmaplock.RLock() defer db.mmaplock.RUnlock() return db.close() } func (db *DB) close() error { if !db.opened { return nil } db.opened = false db.freelist = nil db.ops.writeAt = nil // Close the mmap. if err := db.munmap(); err != nil { return err } // Close file handles. if db.file != nil { // Close the file descriptor. if err := db.file.Close(); err != nil { return fmt.Errorf("db file close: %s", err) } db.file = nil } db.path = "" return nil } // Update executes a function within the context of a read-write managed transaction. // If no error is returned from the function then the transaction is committed. // If an error is returned then the entire transaction is rolled back. // Any error that is returned from the function or returned from the commit is // returned from the Update() method. // // Attempting to manually commit or rollback within the function will cause a panic. func (db *DB) Update(fn func(*Tx) error) error { t, err := db.Begin(true) if err != nil { return err } // Make sure the transaction rolls back in the event of a panic. defer func() { if t.db != nil { t.rollback() } }() // Mark as a managed tx so that the inner function cannot manually commit. t.managed = true // If an error is returned from the function then rollback and return error. err = fn(t) t.managed = false if err != nil { _ = t.Rollback() return err } return t.Commit() } // View executes a function within the context of a managed read-only transaction. // Any error that is returned from the function is returned from the View() method. // // Attempting to manually rollback within the function will cause a panic. func (db *DB) View(fn func(*Tx) error) error { t, err := db.Begin(false) if err != nil { return err } // Make sure the transaction rolls back in the event of a panic. defer func() { if t.db != nil { t.rollback() } }() // Mark as a managed tx so that the inner function cannot manually rollback. t.managed = true // If an error is returned from the function then pass it through. err = fn(t) t.managed = false if err != nil { _ = t.Rollback() return err } if err := t.Rollback(); err != nil { return err } return nil } // pageExists checks whether the page with the given id exists. func (db *DB) pageExists(id pgid) bool { // The page exists if it is not in the freelist or out of the data range. return !db.freelist.cache[pgid(id)] && int(id+1)*db.pageSize < db.datasz } // page retrieves a page reference from the mmap based on the current page size. func (db *DB) page(id pgid) *page { pos := id * pgid(db.pageSize) return (*page)(unsafe.Pointer(&db.data[pos])) } // pageInBuffer retrieves a page reference from a given byte array based on the current // page size. func (db *DB) pageInBuffer(b []byte, id pgid) *page { pos := id * pgid(db.pageSize) return (*page)(unsafe.Pointer(&b[pos])) } // meta retrieves the current meta page reference. func (db *DB) meta() *meta { // We have to return the meta with the highest txid which doesn't fail // validation. Otherwise, we can cause errors when in fact the database is // in a consistent state. metaA is the one with the higher txid. metaA := db.meta0 metaB := db.meta1 if db.meta1.txid > db.meta0.txid { metaA = db.meta1 metaB = db.meta0 } // Use higher meta page if valid. Otherwise fallback to previous, if valid. if err := metaA.validate(); err == nil { return metaA } else if err := metaB.validate(); err == nil { return metaB } // This should never be reached, because both meta1 and meta0 were validated // on mmap() and we do fsync() on every write. panic("pagebuf.PageBuf.meta(): invalid meta pages") } // allocate returns a contiguous block of memory starting at a given page. func (db *DB) allocate(count int) (*page, error) { // Allocate a temporary buffer for the page. var buf []byte if count == 1 { buf = db.pagePool.Get().([]byte) } else { buf = make([]byte, count*db.pageSize) } p := (*page)(unsafe.Pointer(&buf[0])) p.overflow = uint32(count - 1) // Use pages from the freelist if they are available. if p.id = db.freelist.allocate(count); p.id != 0 { return p, nil } // Resize mmap() if we're at the end. p.id = db.rwtx.meta.pgid var minsz = int((p.id+pgid(count))+1) * db.pageSize if minsz >= db.datasz { if err := db.mmap(minsz); err != nil { return nil, fmt.Errorf("mmap allocate error: %s", err) } } // Move the page id high water mark. db.rwtx.meta.pgid += pgid(count) return p, nil } // grow grows the size of the database to the given sz. func (db *DB) grow(sz int) error { // Ignore if the new size is less than available file size. if sz <= db.filesz { return nil } // If the data is smaller than the alloc size then only allocate what's needed. // Once it goes over the allocation size then allocate in chunks. if db.datasz < db.AllocSize { sz = db.datasz } else { sz += db.AllocSize } // Truncate and fsync to ensure file size metadata is flushed. // https://github.com/boltdb/bolt/issues/284 if runtime.GOOS != "windows" { if err := db.file.Truncate(int64(sz)); err != nil { return fmt.Errorf("file resize error: %s", err) } } if err := db.file.Sync(); err != nil { return fmt.Errorf("file sync error: %s", err) } db.filesz = sz return nil } // mmap opens the underlying memory-mapped file and initializes it. // minsz is the minimum size that the mmap can be. func (db *DB) mmap(minsz int) error { db.mmaplock.Lock() defer db.mmaplock.Unlock() info, err := db.file.Stat() if err != nil { return fmt.Errorf("mmap stat error: %s", err) } else if int(info.Size()) < db.pageSize*2 { return fmt.Errorf("file size too small") } // Ensure the size is at least the minimum size. var size = int(info.Size()) if size < minsz { size = minsz } size, err = db.mmapSize(size) if err != nil { return err } // Unmap existing data before continuing. if err := db.munmap(); err != nil { return err } // Memory-map the data file as a byte slice. if err := mmap(db, size); err != nil { return err } // Save references to the meta pages. db.meta0 = db.page(0).meta() db.meta1 = db.page(1).meta() // Validate the meta pages. We only return an error if both meta pages fail // validation, since meta0 failing validation means that it wasn't saved // properly -- but we can recover using meta1. And vice-versa. err0 := db.meta0.validate() err1 := db.meta1.validate() if err0 != nil && err1 != nil { return err0 } return nil } // munmap unmaps the data file from memory. func (db *DB) munmap() error { if err := munmap(db); err != nil { return fmt.Errorf("unmap error: %s", err) } return nil } // mmapSize determines the appropriate size for the mmap given the current size // of the database. The minimum size is 32KB and doubles until it reaches 1GB. // Returns an error if the new mmap size is greater than the max allowed. func (db *DB) mmapSize(size int) (int, error) { // Double the size from 32KB until 1GB. for i := uint(15); i <= 30; i++ { if size <= 1< maxMapSize { return 0, fmt.Errorf("mmap too large") } // If larger than 1GB then grow by 1GB at a time. sz := int64(size) if remainder := sz % int64(maxMmapStep); remainder > 0 { sz += int64(maxMmapStep) - remainder } // Ensure that the mmap size is a multiple of the page size. // This should always be true since we're incrementing in MBs. pageSize := int64(db.pageSize) if (sz % pageSize) != 0 { sz = ((sz / pageSize) + 1) * pageSize } // If we've exceeded the max size then only grow up to the max size. if sz > maxMapSize { sz = maxMapSize } return int(sz), nil } func (db *DB) String() string { return fmt.Sprintf("PageBuf<%s>", db.path) } // Path returns the path to the currently opened pagebuf file. func (db *DB) Path() string { return db.path } // Begin starts a new transaction. // Multiple read-only transactions can be used concurrently but only one // write transaction can be used at a time. Starting multiple write transactions // will cause the calls to block and be serialized until the current write // transaction finishes. // // Transactions should not be dependent on one another. Opening a read // transaction and a write transaction in the same goroutine can cause the // writer to deadlock because the database periodically needs to re-mmap itself // as it grows and it cannot do that while a read transaction is open. // // If a long running read transaction (for example, a snapshot transaction) is // needed, you might want to set PageBuf.InitialMmapSize to a large enough value // to avoid potential blocking of write transaction. // // IMPORTANT: You must close read-only transactions after you are finished or // else the database will not reclaim old pages. func (db *DB) Begin(writable bool) (*Tx, error) { if writable { return db.beginRWTx() } return db.beginTx() } func (db *DB) beginTx() (*Tx, error) { // Lock the meta pages while we initialize the transaction. We obtain // the meta lock before the mmap lock because that's the order that the // write transaction will obtain them. db.metalock.Lock() // Obtain a read-only lock on the mmap. When the mmap is remapped it will // obtain a write lock so all transactions must finish before it can be // remapped. db.mmaplock.RLock() // Exit if the database is not open yet. if !db.opened { db.mmaplock.RUnlock() db.metalock.Unlock() return nil, ErrDatabaseNotOpen } // Create a transaction associated with the database. t := &Tx{} t.init(db) // Keep track of transaction until it closes. db.txs = append(db.txs, t) // Unlock the meta pages. db.metalock.Unlock() return t, nil } func (db *DB) beginRWTx() (*Tx, error) { // Obtain writer lock. This is released by the transaction when it closes. // This enforces only one writer transaction at a time. db.rwlock.Lock() // Once we have the writer lock then we can lock the meta pages so that // we can set up the transaction. db.metalock.Lock() defer db.metalock.Unlock() // Exit if the database is not open yet. if !db.opened { db.rwlock.Unlock() return nil, ErrDatabaseNotOpen } // Create a transaction associated with the database. t := &Tx{writable: true} t.init(db) db.rwtx = t // Free any pages associated with closed read-only transactions. var minid txid = 0xFFFFFFFFFFFFFFFF for _, t := range db.txs { if t.meta.txid < minid { minid = t.meta.txid } } if minid > 0 { db.freelist.release(minid - 1) } return t, nil } // removeTx removes a transaction from the database. func (db *DB) removeTx(tx *Tx) { // Release the read lock on the mmap. db.mmaplock.RUnlock() // Use the meta lock to restrict access to the DB object. db.metalock.Lock() // Remove the transaction. for i, t := range db.txs { if t == tx { db.txs = append(db.txs[:i], db.txs[i+1:]...) break } } // Unlock the meta pages. db.metalock.Unlock() } // Size represents a valid page size. type Size int8 // The valid sizes for allocated pages. const ( Size512 Size = -3 Size1024 = -2 Size2048 = -1 Size4096 = 0 Size8192 = 1 ) const ( upageSizeMin = Size512 upageSizeMax = Size8192 ) type meta struct { magic uint32 version uint32 pageSize uint32 flags uint32 freelist pgid txid txid pgid pgid checksum uint64 } // validate checks the marker bytes and version of the meta page to ensure it matches this binary. func (m *meta) validate() error { if m.magic != magic { return ErrInvalid } else if m.version != version { return ErrVersionMismatch } else if m.checksum != 0 && m.checksum != m.sum64() { return ErrChecksum } return nil } // copy copies one meta object to another. func (m *meta) copy(dest *meta) { *dest = *m } // write writes the meta onto a page. func (m *meta) write(p *page) { if m.freelist >= m.pgid { panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) } // Page id is either going to be 0 or 1 which we can determine by the transaction ID. p.id = pgid(m.txid % 2) p.flags |= pageFlagMeta // Calculate the checksum. m.checksum = m.sum64() m.copy(p.meta()) } // generates the checksum for the meta. func (m *meta) sum64() uint64 { var h = fnv.New64a() _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) return h.Sum64() } // _assert will panic with a given formatted message if the given condition is false. func _assert(condition bool, msg string, v ...interface{}) { if !condition { panic(fmt.Sprintf("assertion failed: "+msg, v...)) } }