summary refs log tree commit diff stats
path: root/src/rocksdb/table/plain/plain_table_key_coding.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/rocksdb/table/plain/plain_table_key_coding.h 201
1 files changed, 201 insertions, 0 deletions
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h
new file mode 100644
index 000000000..9cda7df32
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <array>
+
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// The file contains three helper classes of PlainTable format,
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes issue the lowest level of operations of PlainTable.
+// Actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper used in PlainTableBuilder to write PlainTable keys to an output file.
+// Falls back to kPlain without a prefix extractor; sparseness is at least 1.
+class PlainTableKeyEncoder {
+ public:
+  explicit PlainTableKeyEncoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor,
+                                size_t index_sparseness)
+      : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+        key_count_for_prefix_(0) {}
+  // key: the key to write out, in the format of internal key.
+  // file: the output file to write to.
+  // offset: offset in the file. Needs to be updated after appending bytes
+  //         for the key.
+  // meta_bytes_buf: buffer for extra meta bytes.
+  // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+  //         if meta_bytes_buf is updated.
+  IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
+                     uint64_t* offset, char* meta_bytes_buf,
+                     size_t* meta_bytes_buf_size);
+
+  // Returns the encoding type in effect (may be the kPlain fallback).
+  EncodingType GetEncodingType() const { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether a read is served from mmap or from a non-mmap file read.
+// The class is implemented in a way to favor the performance of mmap case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+      : file_info_(_file_info), num_buf_(0) {}
+
+  ~PlainTableFileReader() {
+    // Should fix: status_ may never be inspected, so permit that explicitly.
+    status_.PermitUncheckedError();
+  }
+
+  // In mmaped mode, the results point to mmaped area of the file, which
+  // means they stay valid until the file is closed.
+  // In non-mmap mode, the results point to an internal buffer. If the caller
+  // makes another read call, the results may not be valid. So callers should
+  // make a copy when needed.
+  // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // covers these two common use cases:
+  // (1) hash index only identifies one location, we read the key to verify
+  //     the location, and read key and value if it is the right location.
+  // (2) after hash index checking, we identify two locations (because of
+  //     hash bucket conflicts), we binary search the two locations to see
+  //     which one is what we need and start to read from the location.
+  // These two most common use cases will be covered by the two buffers
+  // so that we don't need to re-read the same location.
+  // Currently we keep a fixed size buffer. If a read doesn't exactly fit
+  // the buffer, we replace the second buffer with the location user reads.
+  //
+  // If return false, status code is stored in status_.
+  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+    if (file_info_->is_mmap_mode) {
+      // Widen before adding so a wrapped 32-bit sum cannot pass the check.
+      assert(uint64_t{file_offset} + len <= file_info_->data_end_offset);
+      *out = Slice(file_info_->file_data.data() + file_offset, len);
+      return true;
+    }
+    return ReadNonMmap(file_offset, len, out);
+  }
+
+  // If return false, status code is stored in status_.
+  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means eof. false means failure and status is saved
+  // in status_. Not directly returning Status to save copying status
+  // object to match previous performance of mmap mode.
+  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  Status status() const { return status_; }
+
+  const PlainTableReaderFileInfo* file_info() const { return file_info_; }
+
+ private:
+  const PlainTableReaderFileInfo* file_info_;
+
+  struct Buffer {
+    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+    std::unique_ptr<char[]> buf;
+    uint32_t buf_start_offset;
+    uint32_t buf_len;
+    uint32_t buf_capacity;
+  };
+
+  // Keep buffers for two recent reads.
+  std::array<std::unique_ptr<Buffer>, 2> buffers_;
+  uint32_t num_buf_;
+  Status status_;
+
+  Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys. Owns a PlainTableFileReader (file_reader_).
+// NOTE(review): likely used by PlainTableReader, not the builder; confirm.
+class PlainTableKeyDecoder {
+ public:
+  explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+                                EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor)
+      : file_reader_(file_info),
+        encoding_type_(encoding_type),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor) {}
+
+  // Find the next key.
+  // start_offset: offset where the key starts (presumably a file offset;
+  //         confirm against PlainTableFileReader).
+  // parsed_key: the output of the result key
+  // internal_key: if not null, fill with the output of the result key in
+  //         un-parsed format
+  // value: NOTE(review): presumably receives the entry's value; confirm.
+  // bytes_read: how many bytes read from start. Output
+  // seekable: whether key can be read from this place. Used when building
+  //         indexes. Output.
+  Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                 Slice* internal_key, Slice* value, uint32_t* bytes_read,
+                 bool* seekable = nullptr);
+
+  // Same parameters as NextKey, minus the value output.
+  Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                        Slice* internal_key, uint32_t* bytes_read,
+                        bool* seekable = nullptr);
+
+  PlainTableFileReader file_reader_;
+  EncodingType encoding_type_;
+  uint32_t prefix_len_ = 0;
+  uint32_t fixed_user_key_len_;
+  Slice saved_user_key_;
+  IterKey cur_key_;
+  const SliceTransform* prefix_extractor_;
+  bool in_prefix_ = false;
+
+ private:
+  Status NextPlainEncodingKey(uint32_t start_offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, uint32_t* bytes_read,
+                              bool* seekable = nullptr);
+  Status NextPrefixEncodingKey(uint32_t start_offset,
+                               ParsedInternalKey* parsed_key,
+                               Slice* internal_key, uint32_t* bytes_read,
+                               bool* seekable = nullptr);
+  Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+                         ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+                         bool* internal_key_valid, Slice* internal_key);
+  inline Status DecodeSize(uint32_t start_offset,
+                           PlainTableEntryType* entry_type, uint32_t* key_size,
+                           uint32_t* bytes_read);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE