// Copyright 2019 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package rawdb

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/log"
	"github.com/ethereum/go-ethereum/metrics"
	"github.com/golang/snappy"
)

var (
	// errClosed is returned if an operation attempts to read from or write to the
	// freezer table after it has already been closed.
	errClosed = errors.New("closed")

	// errOutOfBounds is returned if the item requested is not contained within the
	// freezer table.
	errOutOfBounds = errors.New("out of bounds")

	// errNotSupported is returned if the database doesn't support the required operation.
	errNotSupported = errors.New("this operation is not supported")
)

// indexEntry contains the number/id of the file that the data resides in, as well as the
// offset within the file to the end of the data.
// In serialized form, the filenum is stored as uint16.
type indexEntry struct {
	filenum uint32 // stored as uint16 ( 2 bytes)
	offset  uint32 // stored as uint32 ( 4 bytes)
}

// indexEntrySize is the size in bytes of one serialized indexEntry
// (2 bytes of file number followed by 4 bytes of offset).
const indexEntrySize = 6

// unmarshalBinary deserializes binary b into the rawIndex entry.
func (i *indexEntry) unmarshalBinary(b []byte) error { i.filenum = uint32(binary.BigEndian.Uint16(b[:2])) i.offset = binary.BigEndian.Uint32(b[2:6]) return nil } // marshallBinary serializes the rawIndex entry into binary. func (i *indexEntry) marshallBinary() []byte { b := make([]byte, indexEntrySize) binary.BigEndian.PutUint16(b[:2], uint16(i.filenum)) binary.BigEndian.PutUint32(b[2:6], i.offset) return b } // freezerTable represents a single chained data table within the freezer (e.g. blocks). // It consists of a data file (snappy encoded arbitrary data blobs) and an indexEntry // file (uncompressed 64 bit indices into the data file). type freezerTable struct { // WARNING: The `items` field is accessed atomically. On 32 bit platforms, only // 64-bit aligned fields can be atomic. The struct is guaranteed to be so aligned, // so take advantage of that (https://golang.org/pkg/sync/atomic/#pkg-note-BUG). items uint64 // Number of items stored in the table (including items removed from tail) noCompression bool // if true, disables snappy compression. Note: does not work retroactively maxFileSize uint32 // Max file size for data-files name string path string head *os.File // File descriptor for the data head of the table files map[uint32]*os.File // open files headId uint32 // number of the currently active head file tailId uint32 // number of the earliest file index *os.File // File descriptor for the indexEntry file of the table // In the case that old items are deleted (from the tail), we use itemOffset // to count how many historic items have gone missing. 
itemOffset uint32 // Offset (number of discarded items) headBytes uint32 // Number of bytes written to the head file readMeter metrics.Meter // Meter for measuring the effective amount of data read writeMeter metrics.Meter // Meter for measuring the effective amount of data written sizeGauge metrics.Gauge // Gauge for tracking the combined size of all freezer tables logger log.Logger // Logger with database path and table name ambedded lock sync.RWMutex // Mutex protecting the data file descriptors } // newTable opens a freezer table with default settings - 2G files func newTable(path string, name string, readMeter metrics.Meter, writeMeter metrics.Meter, sizeGauge metrics.Gauge, disableSnappy bool) (*freezerTable, error) { return newCustomTable(path, name, readMeter, writeMeter, sizeGauge, 2*1000*1000*1000, disableSnappy) } // openFreezerFileForAppend opens a freezer table file and seeks to the end func openFreezerFileForAppend(filename string) (*os.File, error) { // Open the file without the O_APPEND flag // because it has differing behaviour during Truncate operations // on different OS's file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0644) if err != nil { return nil, err } // Seek to end for append if _, err = file.Seek(0, io.SeekEnd); err != nil { return nil, err } return file, nil } // openFreezerFileForReadOnly opens a freezer table file for read only access func openFreezerFileForReadOnly(filename string) (*os.File, error) { return os.OpenFile(filename, os.O_RDONLY, 0644) } // openFreezerFileTruncated opens a freezer table making sure it is truncated func openFreezerFileTruncated(filename string) (*os.File, error) { return os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) } // truncateFreezerFile resizes a freezer table file and seeks to the end func truncateFreezerFile(file *os.File, size int64) error { if err := file.Truncate(size); err != nil { return err } // Seek to end for append if _, err := file.Seek(0, io.SeekEnd); err != nil 
{ return err } return nil } // newCustomTable opens a freezer table, creating the data and index files if they are // non existent. Both files are truncated to the shortest common length to ensure // they don't go out of sync. func newCustomTable(path string, name string, readMeter metrics.Meter, writeMeter metrics.Meter, sizeGauge metrics.Gauge, maxFilesize uint32, noCompression bool) (*freezerTable, error) { // Ensure the containing directory exists and open the indexEntry file if err := os.MkdirAll(path, 0755); err != nil { return nil, err } var idxName string if noCompression { // Raw idx idxName = fmt.Sprintf("%s.ridx", name) } else { // Compressed idx idxName = fmt.Sprintf("%s.cidx", name) } offsets, err := openFreezerFileForAppend(filepath.Join(path, idxName)) if err != nil { return nil, err } // Create the table and repair any past inconsistency tab := &freezerTable{ index: offsets, files: make(map[uint32]*os.File), readMeter: readMeter, writeMeter: writeMeter, sizeGauge: sizeGauge, name: name, path: path, logger: log.New("database", path, "table", name), noCompression: noCompression, maxFileSize: maxFilesize, } if err := tab.repair(); err != nil { tab.Close() return nil, err } // Initialize the starting size counter size, err := tab.sizeNolock() if err != nil { tab.Close() return nil, err } tab.sizeGauge.Inc(int64(size)) return tab, nil } // repair cross checks the head and the index file and truncates them to // be in sync with each other after a potential crash / data loss. 
func (t *freezerTable) repair() error { // Create a temporary offset buffer to init files with and read indexEntry into buffer := make([]byte, indexEntrySize) // If we've just created the files, initialize the index with the 0 indexEntry stat, err := t.index.Stat() if err != nil { return err } if stat.Size() == 0 { if _, err := t.index.Write(buffer); err != nil { return err } } // Ensure the index is a multiple of indexEntrySize bytes if overflow := stat.Size() % indexEntrySize; overflow != 0 { truncateFreezerFile(t.index, stat.Size()-overflow) // New file can't trigger this path } // Retrieve the file sizes and prepare for truncation if stat, err = t.index.Stat(); err != nil { return err } offsetsSize := stat.Size() // Open the head file var ( firstIndex indexEntry lastIndex indexEntry contentSize int64 contentExp int64 ) // Read index zero, determine what file is the earliest // and what item offset to use t.index.ReadAt(buffer, 0) firstIndex.unmarshalBinary(buffer) t.tailId = firstIndex.filenum t.itemOffset = firstIndex.offset t.index.ReadAt(buffer, offsetsSize-indexEntrySize) lastIndex.unmarshalBinary(buffer) t.head, err = t.openFile(lastIndex.filenum, openFreezerFileForAppend) if err != nil { return err } if stat, err = t.head.Stat(); err != nil { return err } contentSize = stat.Size() // Keep truncating both files until they come in sync contentExp = int64(lastIndex.offset) for contentExp != contentSize { // Truncate the head file to the last offset pointer if contentExp < contentSize { t.logger.Warn("Truncating dangling head", "indexed", common.StorageSize(contentExp), "stored", common.StorageSize(contentSize)) if err := truncateFreezerFile(t.head, contentExp); err != nil { return err } contentSize = contentExp } // Truncate the index to point within the head file if contentExp > contentSize { t.logger.Warn("Truncating dangling indexes", "indexed", common.StorageSize(contentExp), "stored", common.StorageSize(contentSize)) if err := 
truncateFreezerFile(t.index, offsetsSize-indexEntrySize); err != nil { return err } offsetsSize -= indexEntrySize t.index.ReadAt(buffer, offsetsSize-indexEntrySize) var newLastIndex indexEntry newLastIndex.unmarshalBinary(buffer) // We might have slipped back into an earlier head-file here if newLastIndex.filenum != lastIndex.filenum { // Release earlier opened file t.releaseFile(lastIndex.filenum) if t.head, err = t.openFile(newLastIndex.filenum, openFreezerFileForAppend); err != nil { return err } if stat, err = t.head.Stat(); err != nil { // TODO, anything more we can do here? // A data file has gone missing... return err } contentSize = stat.Size() } lastIndex = newLastIndex contentExp = int64(lastIndex.offset) } } // Ensure all reparation changes have been written to disk if err := t.index.Sync(); err != nil { return err } if err := t.head.Sync(); err != nil { return err } // Update the item and byte counters and return t.items = uint64(t.itemOffset) + uint64(offsetsSize/indexEntrySize-1) // last indexEntry points to the end of the data file t.headBytes = uint32(contentSize) t.headId = lastIndex.filenum // Close opened files and preopen all files if err := t.preopen(); err != nil { return err } t.logger.Debug("Chain freezer table opened", "items", t.items, "size", common.StorageSize(t.headBytes)) return nil } // preopen opens all files that the freezer will need. This method should be called from an init-context, // since it assumes that it doesn't have to bother with locking // The rationale for doing preopen is to not have to do it from within Retrieve, thus not needing to ever // obtain a write-lock within Retrieve. 
func (t *freezerTable) preopen() (err error) { // The repair might have already opened (some) files t.releaseFilesAfter(0, false) // Open all except head in RDONLY for i := t.tailId; i < t.headId; i++ { if _, err = t.openFile(i, openFreezerFileForReadOnly); err != nil { return err } } // Open head in read/write t.head, err = t.openFile(t.headId, openFreezerFileForAppend) return err } // truncate discards any recent data above the provided threshold number. func (t *freezerTable) truncate(items uint64) error { t.lock.Lock() defer t.lock.Unlock() // If our item count is correct, don't do anything existing := atomic.LoadUint64(&t.items) if existing <= items { return nil } // We need to truncate, save the old size for metrics tracking oldSize, err := t.sizeNolock() if err != nil { return err } // Something's out of sync, truncate the table's offset index log := t.logger.Debug if existing > items+1 { log = t.logger.Warn // Only loud warn if we delete multiple items } log("Truncating freezer table", "items", existing, "limit", items) if err := truncateFreezerFile(t.index, int64(items+1)*indexEntrySize); err != nil { return err } // Calculate the new expected size of the data file and truncate it buffer := make([]byte, indexEntrySize) if _, err := t.index.ReadAt(buffer, int64(items*indexEntrySize)); err != nil { return err } var expected indexEntry expected.unmarshalBinary(buffer) // We might need to truncate back to older files if expected.filenum != t.headId { // If already open for reading, force-reopen for writing t.releaseFile(expected.filenum) newHead, err := t.openFile(expected.filenum, openFreezerFileForAppend) if err != nil { return err } // Release any files _after the current head -- both the previous head // and any files which may have been opened for reading t.releaseFilesAfter(expected.filenum, true) // Set back the historic head t.head = newHead atomic.StoreUint32(&t.headId, expected.filenum) } if err := truncateFreezerFile(t.head, int64(expected.offset)); 
err != nil { return err } // All data files truncated, set internal counters and return atomic.StoreUint64(&t.items, items) atomic.StoreUint32(&t.headBytes, expected.offset) // Retrieve the new size and update the total size counter newSize, err := t.sizeNolock() if err != nil { return err } t.sizeGauge.Dec(int64(oldSize - newSize)) return nil } // Close closes all opened files. func (t *freezerTable) Close() error { t.lock.Lock() defer t.lock.Unlock() var errs []error if err := t.index.Close(); err != nil { errs = append(errs, err) } t.index = nil for _, f := range t.files { if err := f.Close(); err != nil { errs = append(errs, err) } } t.head = nil if errs != nil { return fmt.Errorf("%v", errs) } return nil } // openFile assumes that the write-lock is held by the caller func (t *freezerTable) openFile(num uint32, opener func(string) (*os.File, error)) (f *os.File, err error) { var exist bool if f, exist = t.files[num]; !exist { var name string if t.noCompression { name = fmt.Sprintf("%s.%04d.rdat", t.name, num) } else { name = fmt.Sprintf("%s.%04d.cdat", t.name, num) } f, err = opener(filepath.Join(t.path, name)) if err != nil { return nil, err } t.files[num] = f } return f, err } // releaseFile closes a file, and removes it from the open file cache. // Assumes that the caller holds the write lock func (t *freezerTable) releaseFile(num uint32) { if f, exist := t.files[num]; exist { delete(t.files, num) f.Close() } } // releaseFilesAfter closes all open files with a higher number, and optionally also deletes the files func (t *freezerTable) releaseFilesAfter(num uint32, remove bool) { for fnum, f := range t.files { if fnum > num { delete(t.files, fnum) f.Close() if remove { os.Remove(f.Name()) } } } } // Append injects a binary blob at the end of the freezer table. The item number // is a precautionary parameter to ensure data correctness, but the table will // reject already existing data. 
// // Note, this method will *not* flush any data to disk so be sure to explicitly // fsync before irreversibly deleting data from the database. func (t *freezerTable) Append(item uint64, blob []byte) error { // Read lock prevents competition with truncate t.lock.RLock() // Ensure the table is still accessible if t.index == nil || t.head == nil { t.lock.RUnlock() return errClosed } // Ensure only the next item can be written, nothing else if atomic.LoadUint64(&t.items) != item { t.lock.RUnlock() return fmt.Errorf("appending unexpected item: want %d, have %d", t.items, item) } // Encode the blob and write it into the data file if !t.noCompression { blob = snappy.Encode(nil, blob) } bLen := uint32(len(blob)) if t.headBytes+bLen < bLen || t.headBytes+bLen > t.maxFileSize { // we need a new file, writing would overflow t.lock.RUnlock() t.lock.Lock() nextID := atomic.LoadUint32(&t.headId) + 1 // We open the next file in truncated mode -- if this file already // exists, we need to start over from scratch on it newHead, err := t.openFile(nextID, openFreezerFileTruncated) if err != nil { t.lock.Unlock() return err } // Close old file, and reopen in RDONLY mode t.releaseFile(t.headId) t.openFile(t.headId, openFreezerFileForReadOnly) // Swap out the current head t.head = newHead atomic.StoreUint32(&t.headBytes, 0) atomic.StoreUint32(&t.headId, nextID) t.lock.Unlock() t.lock.RLock() } defer t.lock.RUnlock() if _, err := t.head.Write(blob); err != nil { return err } newOffset := atomic.AddUint32(&t.headBytes, bLen) idx := indexEntry{ filenum: atomic.LoadUint32(&t.headId), offset: newOffset, } // Write indexEntry t.index.Write(idx.marshallBinary()) t.writeMeter.Mark(int64(bLen + indexEntrySize)) t.sizeGauge.Inc(int64(bLen + indexEntrySize)) atomic.AddUint64(&t.items, 1) return nil } // getBounds returns the indexes for the item // returns start, end, filenumber and error func (t *freezerTable) getBounds(item uint64) (uint32, uint32, uint32, error) { buffer := make([]byte, 
indexEntrySize) var startIdx, endIdx indexEntry // Read second index if _, err := t.index.ReadAt(buffer, int64((item+1)*indexEntrySize)); err != nil { return 0, 0, 0, err } endIdx.unmarshalBinary(buffer) // Read first index (unless it's the very first item) if item != 0 { if _, err := t.index.ReadAt(buffer, int64(item*indexEntrySize)); err != nil { return 0, 0, 0, err } startIdx.unmarshalBinary(buffer) } else { // Special case if we're reading the first item in the freezer. We assume that // the first item always start from zero(regarding the deletion, we // only support deletion by files, so that the assumption is held). // This means we can use the first item metadata to carry information about // the 'global' offset, for the deletion-case return 0, endIdx.offset, endIdx.filenum, nil } if startIdx.filenum != endIdx.filenum { // If a piece of data 'crosses' a data-file, // it's actually in one piece on the second data-file. // We return a zero-indexEntry for the second file as start return 0, endIdx.offset, endIdx.filenum, nil } return startIdx.offset, endIdx.offset, endIdx.filenum, nil } // Retrieve looks up the data offset of an item with the given number and retrieves // the raw binary blob from the data file. 
func (t *freezerTable) Retrieve(item uint64) ([]byte, error) { t.lock.RLock() // Ensure the table and the item is accessible if t.index == nil || t.head == nil { t.lock.RUnlock() return nil, errClosed } if atomic.LoadUint64(&t.items) <= item { t.lock.RUnlock() return nil, errOutOfBounds } // Ensure the item was not deleted from the tail either if uint64(t.itemOffset) > item { t.lock.RUnlock() return nil, errOutOfBounds } startOffset, endOffset, filenum, err := t.getBounds(item - uint64(t.itemOffset)) if err != nil { t.lock.RUnlock() return nil, err } dataFile, exist := t.files[filenum] if !exist { t.lock.RUnlock() return nil, fmt.Errorf("missing data file %d", filenum) } // Retrieve the data itself, decompress and return blob := make([]byte, endOffset-startOffset) if _, err := dataFile.ReadAt(blob, int64(startOffset)); err != nil { t.lock.RUnlock() return nil, err } t.lock.RUnlock() t.readMeter.Mark(int64(len(blob) + 2*indexEntrySize)) if t.noCompression { return blob, nil } return snappy.Decode(nil, blob) } // has returns an indicator whether the specified number data // exists in the freezer table. func (t *freezerTable) has(number uint64) bool { return atomic.LoadUint64(&t.items) > number } // size returns the total data size in the freezer table. func (t *freezerTable) size() (uint64, error) { t.lock.RLock() defer t.lock.RUnlock() return t.sizeNolock() } // sizeNolock returns the total data size in the freezer table without obtaining // the mutex first. func (t *freezerTable) sizeNolock() (uint64, error) { stat, err := t.index.Stat() if err != nil { return 0, err } total := uint64(t.maxFileSize)*uint64(t.headId-t.tailId) + uint64(t.headBytes) + uint64(stat.Size()) return total, nil } // Sync pushes any pending data from memory out to disk. This is an expensive // operation, so use it with care. 
func (t *freezerTable) Sync() error { if err := t.index.Sync(); err != nil { return err } return t.head.Sync() } // printIndex is a debug print utility function for testing func (t *freezerTable) printIndex() { buf := make([]byte, indexEntrySize) fmt.Printf("|-----------------|\n") fmt.Printf("| fileno | offset |\n") fmt.Printf("|--------+--------|\n") for i := uint64(0); ; i++ { if _, err := t.index.ReadAt(buf, int64(i*indexEntrySize)); err != nil { break } var entry indexEntry entry.unmarshalBinary(buf) fmt.Printf("| %03d | %03d | \n", entry.filenum, entry.offset) if i > 100 { fmt.Printf(" ... \n") break } } fmt.Printf("|-----------------|\n") }