summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/format.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/table/format.h')
-rw-r--r--src/rocksdb/table/format.h375
1 files changed, 375 insertions, 0 deletions
diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h
new file mode 100644
index 000000000..ffb9fb0ca
--- /dev/null
+++ b/src/rocksdb/table/format.h
@@ -0,0 +1,375 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+
+#include "file/file_prefetch_buffer.h"
+#include "file/random_access_file_reader.h"
+#include "memory/memory_allocator.h"
+#include "options/cf_options.h"
+#include "port/malloc.h"
+#include "port/port.h" // noexcept
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFile;
+struct ReadOptions;
+
+bool ShouldReportDetailedTime(Env* env, Statistics* stats);
+
+// Length, in bytes, of the magic number stored in an encoded Footer.
+constexpr uint32_t kMagicNumberLengthByte = 8;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+ // Creates a block handle whose offset and size are both ~0, a special value
+ // meaning "uninitialized," distinct from the "null" block handle.
+ BlockHandle();
+ BlockHandle(uint64_t offset, uint64_t size);
+
+ // The offset of the block in the file.
+ uint64_t offset() const { return offset_; }
+ void set_offset(uint64_t _offset) { offset_ = _offset; }
+
+ // The size of the stored block.
+ uint64_t size() const { return size_; }
+ void set_size(uint64_t _size) { size_ = _size; }
+
+ void EncodeTo(std::string* dst) const;
+ char* EncodeTo(char* dst) const;
+ Status DecodeFrom(Slice* input);
+ Status DecodeSizeFrom(uint64_t offset, Slice* input);
+
+ // Return a string that contains a copy of the handle.
+ std::string ToString(bool hex = true) const;
+
+ // If the block handle's offset and size are both 0, we view it as a
+ // null block handle that points to nowhere.
+ bool IsNull() const { return offset_ == 0 && size_ == 0; }
+
+ static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
+
+ // Maximum encoding length of a BlockHandle (two max-length varint64s).
+ static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
+
+ inline bool operator==(const BlockHandle& rhs) const {
+ return offset_ == rhs.offset_ && size_ == rhs.size_;
+ }
+ inline bool operator!=(const BlockHandle& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ uint64_t offset_;
+ uint64_t size_;
+
+ static const BlockHandle kNullBlockHandle;
+};
+
+// Value in block-based table file index.
+//
+// The index entry for block n is: y -> h, [x],
+// where: y is some key between the last key of block n (inclusive) and the
+// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
+// x, if present, is the first key of block n (unshortened).
+// This struct represents the "h, [x]" part.
+struct IndexValue {
+ BlockHandle handle;
+ // Empty means unknown.
+ Slice first_internal_key;
+
+ IndexValue() = default;
+ IndexValue(BlockHandle _handle, Slice _first_internal_key)
+ : handle(_handle), first_internal_key(_first_internal_key) {}
+
+ // `have_first_key` indicates whether `first_internal_key` is used.
+ // If `previous_handle` is not null, delta encoding is used;
+ // in this case, the two handles must point to consecutive blocks:
+ // handle.offset() ==
+ // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
+ void EncodeTo(std::string* dst, bool have_first_key,
+ const BlockHandle* previous_handle) const;
+ Status DecodeFrom(Slice* input, bool have_first_key,
+ const BlockHandle* previous_handle);
+
+ std::string ToString(bool hex, bool have_first_key) const;
+};
+
+inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
+ // Maps a table format_version to its compress_format_version: compressed
+ // blocks use version 2 as of format_version 2, and version 1 before that.
+ // DO NOT CHANGE THIS FUNCTION, it affects disk format
+ return format_version >= 2 ? 2 : 1;
+}
+
+constexpr uint32_t kLatestFormatVersion = 5;  // highest supported version
+
+inline bool IsSupportedFormatVersion(uint32_t version) {
+ return version <= kLatestFormatVersion;  // every version 0..latest passes
+}
+
+// Footer encapsulates the fixed information stored at the tail end of every
+// SST file. In general, it should only include things that cannot go
+// elsewhere under the metaindex block. For example, checksum_type is
+// required for verifying metaindex block checksum (when applicable), but
+// index block handle can easily go in metaindex block (possible future).
+// See also FooterBuilder below.
+class Footer {
+ public:
+ // Create empty. Populate using DecodeFrom.
+ Footer() {}
+
+ // Deserialize a footer (populate fields) from `input` and check for various
+ // corruptions. `input_offset` is the offset within the target file of
+ // `input` buffer (future use).
+ Status DecodeFrom(Slice input, uint64_t input_offset);
+
+ // Table magic number identifies file as RocksDB SST file and which kind of
+ // SST format is in use.
+ uint64_t table_magic_number() const { return table_magic_number_; }
+
+ // A version (footer and more) within a kind of SST. (It would add more
+ // unnecessary complexity to separate footer versions and
+ // BBTO::format_version.)
+ uint32_t format_version() const { return format_version_; }
+
+ // Block handle for metaindex block.
+ const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+
+ // Block handle for (top-level) index block.
+ const BlockHandle& index_handle() const { return index_handle_; }
+
+ // Checksum type used in the file.
+ ChecksumType checksum_type() const {
+ return static_cast<ChecksumType>(checksum_type_);
+ }
+
+ // Block trailer size used by file with this footer (e.g. 5 for block-based
+ // table and 0 for plain table). This is inferred from magic number so
+ // not in the serialized form.
+ inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
+
+ // Convert this object to a human-readable form.
+ std::string ToString() const;
+
+ // Encoded lengths of Footers. Bytes for serialized Footer will always be
+ // >= kMinEncodedLength and <= kMaxEncodedLength.
+ //
+ // Footer version 0 (legacy) will always occupy exactly this many bytes.
+ // It consists of two block handles, padding, and a magic number.
+ static constexpr uint32_t kVersion0EncodedLength =
+ 2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
+ static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
+
+ // Footer of versions 1 and higher will always occupy exactly this many
+ // bytes. It originally consisted of the checksum type, two block handles,
+ // padding (to maximum handle encoding size), a format version number, and a
+ // magic number.
+ static constexpr uint32_t kNewVersionsEncodedLength =
+ 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
+ static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
+
+ static constexpr uint64_t kNullTableMagicNumber = 0;
+
+ static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
+
+ private:
+ static constexpr int kInvalidChecksumType =
+ (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;  // sets the bit just past ChecksumType's width
+
+ uint64_t table_magic_number_ = kNullTableMagicNumber;
+ uint32_t format_version_ = kInvalidFormatVersion;
+ BlockHandle metaindex_handle_;
+ BlockHandle index_handle_;
+ int checksum_type_ = kInvalidChecksumType;
+ uint8_t block_trailer_size_ = 0;
+};
+
+// Builder for Footer
+class FooterBuilder {
+ public:
+ // Run builder on inputs. This is a single step with lots of parameters for
+ // efficiency (based on perf testing).
+ // * table_magic_number identifies file as RocksDB SST file and which kind of
+ // SST format is in use.
+ // * format_version is a version for the footer and can also apply to other
+ // aspects of the SST file (see BlockBasedTableOptions::format_version).
+ // NOTE: To save complexity in the caller, when format_version == 0 and
+ // there is a corresponding legacy magic number to the one specified, the
+ // legacy magic number will be written for forward compatibility.
+ // * footer_offset is the file offset where the footer will be written
+ // (for future use).
+ // * checksum_type is for formats using block checksums.
+ // * index_handle is optional for some kinds of SST files.
+ void Build(uint64_t table_magic_number, uint32_t format_version,
+ uint64_t footer_offset, ChecksumType checksum_type,
+ const BlockHandle& metaindex_handle,
+ const BlockHandle& index_handle = BlockHandle::NullBlockHandle());
+
+ // After Build(), get a Slice for the serialized Footer, backed by this
+ // FooterBuilder.
+ const Slice& GetSlice() const {
+ assert(slice_.size());
+ return slice_;
+ }
+
+ private:
+ Slice slice_;
+ std::array<char, Footer::kMaxEncodedLength> data_;
+};
+
+// Read the footer from file.
+// If enforce_table_magic_number != 0, ReadFooterFromFile() returns corruption
+// when the table magic number is not equal to enforce_table_magic_number.
+Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer,
+ uint64_t file_size, Footer* footer,
+ uint64_t enforce_table_magic_number = 0);
+
+// Computes a checksum using the given ChecksumType. Sometimes we need to
+// include one more input byte logically at the end but not part of the main
+// data buffer. If size >= 1, then
+// ComputeBuiltinChecksum(type, data, size)
+// ==
+// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
+uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
+ size_t size);
+uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
+ size_t size, char last_byte);
+
+// Represents the contents of a block read from an SST file. Depending on how
+// it's created, it may or may not own the actual block bytes. As an example,
+// BlockContents objects representing data read from mmapped files only point
+// into the mmapped region. Depending on context, it might be a serialized
+// (potentially compressed) block, including a trailer beyond `size`, or an
+// uncompressed block.
+//
+// Please try to use this terminology when dealing with blocks:
+// * "Serialized block" - bytes that go into storage. For block-based table
+// (usually the case) this includes the block trailer. Here the `size` does
+// not include the trailer, but other places in code might include the trailer
+// in the size.
+// * "Maybe compressed block" - like a serialized block, but without the
+// trailer (or no promise of including a trailer). Must be accompanied by a
+// CompressionType in some other variable or field.
+// * "Uncompressed block" - "payload" bytes that are either stored with no
+// compression, used as input to compression function, or result of
+// decompression function.
+// * "Parsed block" - an in-memory form of a block in block cache, as it is
+// used by the table reader. Different C++ types are used depending on the
+// block type (see block_like_traits.h). Only trivially parsable block types
+// use BlockContents as the parsed form.
+//
+struct BlockContents {
+ // Points to block payload (without trailer)
+ Slice data;
+ // Owns the bytes behind `data` when non-null (see own_bytes()).
+ CacheAllocationPtr allocation;
+
+#ifndef NDEBUG
+ // Whether there is a known trailer after what is pointed to by `data`.
+ // See BlockBasedTable::GetCompressionType.
+ bool has_trailer = false;
+#endif // NDEBUG
+
+ BlockContents() {}
+
+ // Does not take ownership of the underlying data bytes.
+ BlockContents(const Slice& _data) : data(_data) {}
+
+ // Takes ownership of the underlying data bytes.
+ BlockContents(CacheAllocationPtr&& _data, size_t _size)
+ : data(_data.get(), _size), allocation(std::move(_data)) {}
+
+ // Takes ownership of the underlying data bytes.
+ BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
+ : data(_data.get(), _size) {
+ allocation.reset(_data.release());
+ }
+
+ // Returns whether the object has ownership of the underlying data bytes.
+ bool own_bytes() const { return allocation.get() != nullptr; }
+
+ // The additional memory space taken by the block data: 0 when the bytes are
+ // not owned; otherwise the allocator's (or malloc's) usable size when
+ // available, falling back to data.size().
+ size_t usable_size() const {
+ if (allocation.get() != nullptr) {
+ auto allocator = allocation.get_deleter().allocator;
+ if (allocator) {
+ return allocator->UsableSize(allocation.get(), data.size());
+ }
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ return malloc_usable_size(allocation.get());
+#else
+ return data.size();
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ } else {
+ return 0; // no extra memory is occupied by the data
+ }
+ }
+
+ // usable_size() plus the size of this struct itself.
+ size_t ApproximateMemoryUsage() const {
+ return usable_size() + sizeof(*this);
+ }
+
+ // Move construction delegates to move assignment, so both are noexcept:
+ // an exception escaping the assignment would terminate the program here.
+ BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
+
+ // Takes over `other`'s data view and allocation (and, in debug builds, its
+ // has_trailer flag). Declared noexcept to match the move constructor.
+ BlockContents& operator=(BlockContents&& other) noexcept {
+ data = std::move(other.data);
+ allocation = std::move(other.allocation);
+#ifndef NDEBUG
+ has_trailer = other.has_trailer;
+#endif // NDEBUG
+ return *this;
+ }
+};
+
+// The `data` points to serialized block contents read in from file, which
+// must be compressed and include a trailer beyond `size`. A new buffer is
+// allocated with the given allocator (or default) and the uncompressed
+// contents are returned in `out_contents`.
+// `format_version` is as defined in include/rocksdb/table.h and is used to
+// determine the compression format version.
+Status UncompressSerializedBlock(const UncompressionInfo& info,
+ const char* data, size_t size,
+ BlockContents* out_contents,
+ uint32_t format_version,
+ const ImmutableOptions& ioptions,
+ MemoryAllocator* allocator = nullptr);
+
+// This is a variant of UncompressSerializedBlock that does not expect a
+// block trailer beyond `size`. (CompressionType is taken from `info`.)
+Status UncompressBlockData(const UncompressionInfo& info, const char* data,
+ size_t size, BlockContents* out_contents,
+ uint32_t format_version,
+ const ImmutableOptions& ioptions,
+ MemoryAllocator* allocator = nullptr);
+
+// Replace the contents of `db_host_id` with the real hostname if necessary.
+Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
+
+// Implementation details follow. Clients should ignore.
+
+// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
+// BlockHandle. Currently we use zeros for null and the bitwise negation of
+// zero (all bits set) for uninitialized.
+inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
+
+inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
+ : offset_(_offset), size_(_size) {}
+
+} // namespace ROCKSDB_NAMESPACE