diff options
Diffstat (limited to 'src/rocksdb/include')
84 files changed, 20461 insertions, 0 deletions
diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h new file mode 100644 index 000000000..a72edbe05 --- /dev/null +++ b/src/rocksdb/include/rocksdb/advanced_options.h @@ -0,0 +1,731 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <memory> + +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; +enum CompressionType : unsigned char; +class TablePropertiesCollectorFactory; +class TableFactory; +struct Options; + +enum CompactionStyle : char { + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + // Not supported in ROCKSDB_LITE. + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + // Not supported in ROCKSDB_LITE + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + // Not supported in ROCKSDB_LITE + kCompactionStyleNone = 0x3, +}; + +// In Level-based compaction, it Determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. +enum CompactionPri : char { + // Slightly prioritize larger files by size compensated by #deletes + kByCompensatedSize = 0x0, + // First compact files whose data's latest update time is oldest. + // Try this if you only update some hot keys in small ranges. + kOldestLargestSeqFirst = 0x1, + // First compact files whose range hasn't been compacted to the next level + // for the longest. If your updates are random across the key space, + // write amplification is slightly better with this option. + kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. + kMinOverlappingRatio = 0x3, +}; + +struct CompactionOptionsFIFO { + // once the total sum of table files reaches this, we will delete the oldest + // table file + // Default: 1GB + uint64_t max_table_files_size; + + // If true, try to do compaction to compact smaller files into larger ones. + // Minimum files to compact follows options.level0_file_num_compaction_trigger + // and compaction won't trigger if average compact bytes per del file is + // larger than options.write_buffer_size. This is to protect large files + // from being compacted again. + // Default: false; + bool allow_compaction = false; + + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} + CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) + : max_table_files_size(_max_table_files_size), + allow_compaction(_allow_compaction) {} +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + // RocksDB's generic default compression level. Internally it'll be translated + // to the default compression level specific to the library being used (see + // comment above `ColumnFamilyOptions::compression`). + // + // The default value is the max 16-bit int as it'll be written out in OPTIONS + // file, which should be portable. + const static int kDefaultCompressionLevel = 32767; + + int window_bits; + int level; + int strategy; + + // Maximum size of dictionaries used to prime the compression library. + // Enabling dictionary can improve compression ratios when there are + // repetitions across data blocks. + // + // The dictionary is created by sampling the SST file data. If + // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's + // dictionary generator. Otherwise, the random samples are used directly as + // the dictionary. + // + // When compression dictionary is disabled, we compress and write each block + // before buffering data for the next one. When compression dictionary is + // enabled, we buffer all SST file data in-memory so we can sample it, as data + // can only be compressed and written after the dictionary has been finalized. + // So users of this feature may see increased memory usage. + // + // Default: 0. + uint32_t max_dict_bytes; + + // Maximum size of training data passed to zstd's dictionary trainer. Using + // zstd's dictionary trainer can achieve even better compression ratio + // improvements than using `max_dict_bytes` alone. + // + // The training data will be used to generate a dictionary of max_dict_bytes. + // + // Default: 0. + uint32_t zstd_max_train_bytes; + + // When the compression options are set by the user, it will be set to "true". + // For bottommost_compression_opts, to enable it, user must set enabled=true. + // Otherwise, bottommost compression will use compression_opts as default + // compression options. + // + // For compression_opts, if compression_opts.enabled=false, it is still + // used as compression options for compression process. + // + // Default: false. + bool enabled; + + CompressionOptions() + : window_bits(-14), + level(kDefaultCompressionLevel), + strategy(0), + max_dict_bytes(0), + zstd_max_train_bytes(0), + enabled(false) {} + CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes, + int _zstd_max_train_bytes, bool _enabled) + : window_bits(wbits), + level(_lev), + strategy(_strategy), + max_dict_bytes(_max_dict_bytes), + zstd_max_train_bytes(_zstd_max_train_bytes), + enabled(_enabled) {} +}; + +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + +struct AdvancedColumnFamilyOptions { + // The maximum number of write buffers that are built up in memory. + // The default and the minimum number is 2, so that when 1 write buffer + // is being flushed to storage, new writes can continue to the other + // write buffer. + // If max_write_buffer_number > 3, writing will be slowed down to + // options.delayed_write_rate if we are writing to the last write buffer + // allowed. + // + // Default: 2 + // + // Dynamically changeable through SetOptions() API + int max_write_buffer_number = 2; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are flushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing lesser + // data to storage if there are duplicate records in each of these + // individual write buffers. Default: 1 + int min_write_buffer_number_to_merge = 1; + + // DEPRECATED + // The total maximum number of write buffers to maintain in memory including + // copies of buffers that have already been flushed. Unlike + // max_write_buffer_number, this parameter does not affect flushing. + // This parameter is being replaced by max_write_buffer_size_to_maintain. + // If both parameters are set to non-zero values, this parameter will be + // ignored. + int max_write_buffer_number_to_maintain = 0; + + // The total maximum size(bytes) of write buffers to maintain in memory + // including copies of buffers that have already been flushed. This parameter + // only affects trimming of flushed buffers and does not affect flushing. + // This controls the maximum amount of write history that will be available + // in memory for conflict checking when Transactions are used. The actual + // size of write history (flushed Memtables) might be higher than this limit + // if further trimming will reduce write history total size below this + // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, + // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. + // Because trimming the next Memtable of size 20MB will reduce total memory + // usage to 52MB which is below the limit, RocksDB will stop trimming. + // + // When using an OptimisticTransactionDB: + // If this value is too low, some transactions may fail at commit time due + // to not being able to determine whether there were any write conflicts. + // + // When using a TransactionDB: + // If Transaction::SetSnapshot is used, TransactionDB will read either + // in-memory write buffers or SST files to do write-conflict checking. + // Increasing this value can reduce the number of reads to SST files + // done for conflict detection. + // + // Setting this value to 0 will cause write buffers to be freed immediately + // after they are flushed. If this value is set to -1, + // 'max_write_buffer_number * write_buffer_size' will be used. + // + // Default: + // If using a TransactionDB/OptimisticTransactionDB, the default value will + // be set to the value of 'max_write_buffer_number * write_buffer_size' + // if it is not explicitly set by the user. Otherwise, the default is 0. + int64_t max_write_buffer_size_to_maintain = 0; + + // Allows thread-safe inplace updates. If this is true, there is no way to + // achieve point-in-time consistency using snapshot or iterator (assuming + // concurrent updates). Hence iterator and multi-get will return results + // which are not consistent as of any point-in-time. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. + // Default: false. + bool inplace_update_support = false; + + // Number of locks used for inplace update + // Default: 10000, if inplace_update_support = true, else 0. + // + // Dynamically changeable through SetOptions() API + size_t inplace_update_num_locks = 10000; + + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). + + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. + + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value) = nullptr; + + // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, + // create prefix bloom for memtable with the size of + // write_buffer_size * memtable_prefix_bloom_size_ratio. + // If it is larger than 0.25, it is sanitized to 0.25. + // + // Default: 0 (disable) + // + // Dynamically changeable through SetOptions() API + double memtable_prefix_bloom_size_ratio = 0.0; + + // Enable whole key bloom filter in memtable. Note this will only take effect + // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering + // can potentially reduce CPU usage for point-look-ups. + // + // Default: false (disable) + // + // Dynamically changeable through SetOptions() API + bool memtable_whole_key_filtering = false; + + // Page size for huge page for the arena used by the memtable. If <=0, it + // won't allocate from huge page but from malloc. + // Users are responsible to reserve huge pages for it to be allocated. For + // example: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + // If there isn't enough free huge page available, it will fall back to + // malloc. + // + // Dynamically changeable through SetOptions() API + size_t memtable_huge_page_size = 0; + + // If non-nullptr, memtable will use the specified function to extract + // prefixes for keys, and for each prefix maintain a hint of insert location + // to reduce CPU usage for inserting keys with the prefix. Keys out of + // domain of the prefix extractor will be insert without using hints. + // + // Currently only the default skiplist based memtable implements the feature. + // All other memtable implementation will ignore the option. It incurs ~250 + // additional bytes of memory overhead to store a hint for each prefix. + // Also concurrent writes (when allow_concurrent_memtable_write is true) will + // ignore the option. + // + // The option is best suited for workloads where keys will likely to insert + // to a location close the last inserted key with the same prefix. + // One example could be inserting keys of the form (prefix + timestamp), + // and keys of the same prefix always comes in with time order. Another + // example would be updating the same key over and over again, in which case + // the prefix can be the key itself. + // + // Default: nullptr (disable) + std::shared_ptr<const SliceTransform> + memtable_insert_with_hint_prefix_extractor = nullptr; + + // Control locality of bloom filter probes to improve CPU cache hit rate. + // This option now only applies to plaintable prefix bloom. This + // optimization is turned off when set to 0, and positive number to turn + // it on. + // Default: 0 + uint32_t bloom_locality = 0; + + // size of one block in arena memory allocation. + // If <= 0, a proper value is automatically calculated (usually 1/8 of + // writer_buffer_size, rounded up to a multiple of 4KB). + // + // There are two additional restriction of the specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // + // Default: 0 + // + // Dynamically changeable through SetOptions() API + size_t arena_block_size = 0; + + // Different levels can have different compression policies. There + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use + // compression algorithms that have better compression but could + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. + // + // NOTICE if level_compaction_dynamic_level_bytes=true, + // compression_per_level[0] still determines L0, but other elements + // of the array are based on base level (the level L0 files are merged + // to), and may not match the level users see from info log for metadata. + // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] + // determines compaction type for level n+i-1. + // For example, if we have three 5 levels, and we determine to merge L0 + // data to L4 (which means L1..L3 will be empty), then the new files go to + // L4 uses compression type compression_per_level[1]. + // If now L0 is merged to L2. Data goes to L2 will be compressed + // according to compression_per_level[1], L3 using compression_per_level[2] + // and L4 using compression_per_level[3]. Compaction for each level can + // change when data grows. + std::vector<CompressionType> compression_per_level; + + // Number of levels for this database + int num_levels = 7; + + // Soft limit on number of level-0 files. We start slowing down writes at this + // point. A value <0 means that no writing slow down will be triggered by + // number of files in level-0. + // + // Default: 20 + // + // Dynamically changeable through SetOptions() API + int level0_slowdown_writes_trigger = 20; + + // Maximum number of level-0 files. We stop writes at this point. + // + // Default: 36 + // + // Dynamically changeable through SetOptions() API + int level0_stop_writes_trigger = 36; + + // Target file size for compaction. + // target_file_size_base is per-file size for level-1. + // Target file size for level L can be calculated by + // target_file_size_base * (target_file_size_multiplier ^ (L-1)) + // For example, if target_file_size_base is 2MB and + // target_file_size_multiplier is 10, then each file on level-1 will + // be 2MB, and each file on level 2 will be 20MB, + // and each file on level-3 will be 200MB. + // + // Default: 64MB. + // + // Dynamically changeable through SetOptions() API + uint64_t target_file_size_base = 64 * 1048576; + + // By default target_file_size_multiplier is 1, which means + // by default files in different levels will have similar size. + // + // Dynamically changeable through SetOptions() API + int target_file_size_multiplier = 1; + + // If true, RocksDB will pick target size of each level dynamically. + // We will pick a base level b >= 1. L0 will be directly merged into level b, + // instead of always into level 1. Level 1 to b-1 need to be empty. + // We try to pick b and its target size so that + // 1. target size is in the range of + // (max_bytes_for_level_base / max_bytes_for_level_multiplier, + // max_bytes_for_level_base] + // 2. target size of the last level (level num_levels-1) equals to extra size + // of the level. + // At the same time max_bytes_for_level_multiplier and + // max_bytes_for_level_multiplier_additional are still satisfied. + // (When L0 is too large, we make some adjustment. See below.) + // + // With this option on, from an empty DB, we make last level the base level, + // which means merging L0 data into the last level, until it exceeds + // max_bytes_for_level_base. And then we make the second last level to be + // base level, to start to merge L0 data to second last level, with its + // target size to be 1/max_bytes_for_level_multiplier of the last level's + // extra size. After the data accumulates more so that we need to move the + // base level to the third last one, and so on. + // + // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, + // and max_bytes_for_level_base=10MB. + // Target sizes of level 1 to 5 starts with: + // [- - - - 10MB] + // with base level is level. Target sizes of level 1 to 4 are not applicable + // because they will not be used. + // Until the size of Level 5 grows to more than 10MB, say 11MB, we make + // base target to level 4 and now the targets looks like: + // [- - - 1.1MB 11MB] + // While data are accumulated, size targets are tuned based on actual data + // of level 5. When level 5 has 50MB of data, the target is like: + // [- - - 5MB 50MB] + // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep + // level 4 to be the base level, its target size needs to be 10.1MB, which + // doesn't satisfy the target size range. So now we make level 3 the target + // size and the target sizes of the levels look like: + // [- - 1.01MB 10.1MB 101MB] + // In the same way, while level 5 further grows, all levels' targets grow, + // like + // [- - 5MB 50MB 500MB] + // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the + // base level and make levels' target sizes like this: + // [- 1.001MB 10.01MB 100.1MB 1001MB] + // and go on... + // + // By doing it, we give max_bytes_for_level_multiplier a priority against + // max_bytes_for_level_base, for a more predictable LSM tree shape. It is + // useful to limit worse case space amplification. + // + // + // If the compaction from L0 is lagged behind, a special mode will be turned + // on to prioritize write amplification against max_bytes_for_level_multiplier + // or max_bytes_for_level_base. The L0 compaction is lagged behind by looking + // at number of L0 files and total L0 size. If number of L0 files is at least + // the double of level0_file_num_compaction_trigger, or the total size is + // at least max_bytes_for_level_base, this mode is on. The target of L1 grows + // to the actual data size in L0, and then determine the target for each level + // so that each level will have the same level multiplier. + // + // For example, when L0 size is 100MB, the size of last level is 1600MB, + // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. + // Since L0 size is larger than max_bytes_for_level_base, this is a L0 + // compaction backlogged mode. So that the L1 size is determined to be 100MB. + // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will + // be needed. The level multiplier will be calculated to be 4 and the three + // levels' target to be [100MB, 400MB, 1600MB]. + // + // In this mode, The number of levels will be no more than the normal mode, + // and the level multiplier will be lower. The write amplification will + // likely to be reduced. + // + // + // max_bytes_for_level_multiplier_additional is ignored with this flag on. + // + // Turning this feature on or off for an existing DB can cause unexpected + // LSM tree structure so it's not recommended. + // + // Default: false + bool level_compaction_dynamic_level_bytes = false; + + // Default: 10. + // + // Dynamically changeable through SetOptions() API + double max_bytes_for_level_multiplier = 10; + + // Different max-size multipliers for different levels. + // These are multiplied by max_bytes_for_level_multiplier to arrive + // at the max-size of each level. + // + // Default: 1 + // + // Dynamically changeable through SetOptions() API + std::vector<int> max_bytes_for_level_multiplier_additional = + std::vector<int>(num_levels, 1); + + // We try to limit number of bytes in one compaction to be lower than this + // threshold. But it's not guaranteed. + // Value 0 will be sanitized. + // + // Default: target_file_size_base * 25 + // + // Dynamically changeable through SetOptions() API + uint64_t max_compaction_bytes = 0; + + // All writes will be slowed down to at least delayed_write_rate if estimated + // bytes needed to be compaction exceed this threshold. + // + // Default: 64GB + // + // Dynamically changeable through SetOptions() API + uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; + + // All writes are stopped if estimated bytes needed to be compaction exceed + // this threshold. + // + // Default: 256GB + // + // Dynamically changeable through SetOptions() API + uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; + + // The compaction style. Default: kCompactionStyleLevel + CompactionStyle compaction_style = kCompactionStyleLevel; + + // If level compaction_style = kCompactionStyleLevel, for each level, + // which files are prioritized to be picked to compact. + // Default: kMinOverlappingRatio + CompactionPri compaction_pri = kMinOverlappingRatio; + + // The options needed to support Universal Style compactions + // + // Dynamically changeable through SetOptions() API + // Dynamic change example: + // SetOptions("compaction_options_universal", "{size_ratio=2;}") + CompactionOptionsUniversal compaction_options_universal; + + // The options for FIFO compaction style + // + // Dynamically changeable through SetOptions() API + // Dynamic change example: + // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}") + CompactionOptionsFIFO compaction_options_fifo; + + // An iteration->Next() sequentially skips over keys with the same + // user-key unless this option is set. This number specifies the number + // of keys (with the same userkey) that will be sequentially + // skipped before a reseek is issued. + // + // Default: 8 + // + // Dynamically changeable through SetOptions() API + uint64_t max_sequential_skip_in_iterations = 8; + + // This is a factory that provides MemTableRep objects. + // Default: a factory that provides a skip-list-based implementation of + // MemTableRep. + std::shared_ptr<MemTableRepFactory> memtable_factory = + std::shared_ptr<SkipListFactory>(new SkipListFactory); + + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + + // This option allows user to collect their own interested statistics of + // the tables. + // Default: empty vector -- no user-defined statistics collection will be + // performed. + typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> + TablePropertiesCollectorFactories; + TablePropertiesCollectorFactories table_properties_collector_factories; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + size_t max_successive_merges = 0; + + // This flag specifies that the implementation should optimize the filters + // mainly for cases where keys are found rather than also optimize for keys + // missed. This would be used in cases where the application knows that + // there are very few misses or the performance in the case of misses is not + // important. + // + // For now, this flag allows us to not store filters for the last level i.e + // the largest level which contains data of the LSM store. For keys which + // are hits, the filters in this level are not useful because we will search + // for the data anyway. NOTE: the filters in other levels are still useful + // even for key hit because they tell us whether to look in that level or go + // to the higher level. + // + // Default: false + bool optimize_filters_for_hits = false; + + // After writing every SST file, reopen it and read all the keys. + // + // Default: false + // + // Dynamically changeable through SetOptions() API + bool paranoid_file_checks = false; + + // In debug mode, RocksDB run consistency checks on the LSM every time the LSM + // change (Flush, Compaction, AddFile). These checks are disabled in release + // mode, use this option to enable them in release mode as well. + // Default: false + bool force_consistency_checks = false; + + // Measure IO stats in compactions and flushes, if true. + // + // Default: false + // + // Dynamically changeable through SetOptions() API + bool report_bg_io_stats = false; + + // Files older than TTL will go through the compaction process. + // Pre-req: This needs max_open_files to be set to -1. + // In Level: Non-bottom-level files older than TTL will go through the + // compation process. + // In FIFO: Files older than TTL will be deleted. + // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 + // In FIFO, this option will have the same meaning as + // periodic_compaction_seconds. Whichever stricter will be used. + // 0 means disabling. + // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to + // pick default. + // + // Default: 30 days for leveled compaction + block based table. disable + // otherwise. + // + // Dynamically changeable through SetOptions() API + uint64_t ttl = 0xfffffffffffffffe; + + // Files older than this value will be picked up for compaction, and + // re-written to the same level as they were before. + // + // A file's age is computed by looking at file_creation_time or creation_time + // table properties in order, if they have valid non-zero values; if not, the + // age is based on the file's last modified time (given by the underlying + // Env). + // + // Supported in Level and FIFO compaction. + // In FIFO compaction, this option has the same meaning as TTL and whichever + // stricter will be used. + // Pre-req: max_open_file == -1. + // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 + // + // Values: + // 0: Turn off Periodic compactions. + // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + // as needed. For now, RocksDB will change this value to 30 days + // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + // process at least once every 30 days if not compacted sooner. + // In FIFO compaction, since the option has the same meaning as ttl, + // when this value is left default, and ttl is left to 0, 30 days will be + // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + // + // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) + // + // Dynamically changeable through SetOptions() API + uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; + + // If this option is set then 1 in N blocks are compressed + // using a fast (lz4) and slow (zstd) compression algorithm. + // The compressibility is reported as stats and the stored + // data is left uncompressed (unless compression is also requested). + uint64_t sample_for_compression = 0; + + // Create ColumnFamilyOptions with default values for all fields + AdvancedColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit AdvancedColumnFamilyOptions(const Options& options); + + // ---------------- OPTIONS NOT SUPPORTED ANYMORE ---------------- + + // NOT SUPPORTED ANYMORE + // This does not do anything anymore. + int max_mem_compaction_level; + + // NOT SUPPORTED ANYMORE -- this options is no longer used + // Puts are delayed to options.delayed_write_rate when any level has a + // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + double soft_rate_limit = 0.0; + + // NOT SUPPORTED ANYMORE -- this options is no longer used + double hard_rate_limit = 0.0; + + // NOT SUPPORTED ANYMORE -- this options is no longer used + unsigned int rate_limit_delay_max_milliseconds = 100; + + // NOT SUPPORTED ANYMORE + // Does not have any effect. + bool purge_redundant_kvs_while_flush = true; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h new file mode 100644 index 000000000..dafefceb3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/c.h @@ -0,0 +1,1801 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. + + C bindings for rocksdb. May be useful as a stable ABI that can be + used by programs that keep rocksdb in a shared library, or for + a JNI api. + + Does not support: + . getters for the option types + . custom comparators that implement key shortening + . capturing post-write-snapshot + . custom iter, db, env, cache implementations using just the C bindings + + Some conventions: + + (1) We expose just opaque struct pointers and functions to clients. + This allows us to change internal representations without having to + recompile clients. + + (2) For simplicity, there is no equivalent to the Slice type. Instead, + the caller has to pass the pointer and length as separate + arguments. + + (3) Errors are represented by a null-terminated c string. NULL + means no error. All operations that can raise an error are passed + a "char** errptr" as the last argument. One of the following must + be true on entry: + *errptr == NULL + *errptr points to a malloc()ed null-terminated error message + On success, a leveldb routine leaves *errptr unchanged. + On failure, leveldb frees the old value of *errptr and + set *errptr to a malloc()ed error message. + + (4) Bools have the type unsigned char (0 == false; rest == true) + + (5) All of the pointer arguments must be non-NULL. +*/ + +#pragma once + +#ifdef _WIN32 +#ifdef ROCKSDB_DLL +#ifdef ROCKSDB_LIBRARY_EXPORTS +#define ROCKSDB_LIBRARY_API __declspec(dllexport) +#else +#define ROCKSDB_LIBRARY_API __declspec(dllimport) +#endif +#else +#define ROCKSDB_LIBRARY_API +#endif +#else +#define ROCKSDB_LIBRARY_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> + +/* Exported types */ + +typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; +typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; +typedef struct rocksdb_compactionfiltercontext_t + rocksdb_compactionfiltercontext_t; +typedef struct rocksdb_compactionfilterfactory_t + rocksdb_compactionfilterfactory_t; +typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_dbpath_t rocksdb_dbpath_t; +typedef struct rocksdb_env_t rocksdb_env_t; +typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t; +typedef struct rocksdb_filelock_t rocksdb_filelock_t; +typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t; +typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t; +typedef struct rocksdb_iterator_t rocksdb_iterator_t; +typedef struct rocksdb_logger_t rocksdb_logger_t; +typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; +typedef struct rocksdb_options_t rocksdb_options_t; +typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t; +typedef struct rocksdb_block_based_table_options_t + rocksdb_block_based_table_options_t; +typedef struct rocksdb_cuckoo_table_options_t + rocksdb_cuckoo_table_options_t; +typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; +typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; +typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; +typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t; +typedef struct rocksdb_snapshot_t rocksdb_snapshot_t; +typedef struct rocksdb_writablefile_t rocksdb_writablefile_t; +typedef struct rocksdb_writebatch_t rocksdb_writebatch_t; +typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t; +typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; +typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t; +typedef struct rocksdb_livefiles_t rocksdb_livefiles_t; +typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t; +typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; +typedef struct rocksdb_ingestexternalfileoptions_t rocksdb_ingestexternalfileoptions_t; +typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t; +typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t; +typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t; +typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; +typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; +typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; +typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t; +typedef struct rocksdb_optimistictransactiondb_t + rocksdb_optimistictransactiondb_t; +typedef struct rocksdb_optimistictransaction_options_t + rocksdb_optimistictransaction_options_t; +typedef struct rocksdb_transaction_t rocksdb_transaction_t; +typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; +typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; +typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; + +/* DB operations */ + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, const char* name, int ttl, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( + const rocksdb_options_t* options, const char* name, + unsigned char error_if_log_file_exist, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, unsigned char flush_before_backup, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( + rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* +rocksdb_restore_options_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( + rocksdb_restore_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be, + uint32_t backup_id, char** errptr); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* +rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_backup_engine_info_timestamp(const rocksdb_backup_engine_info_t* info, + int index); + +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_backup_engine_info_backup_id(const rocksdb_backup_engine_info_t* info, + int index); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backup_engine_info_size(const rocksdb_backup_engine_info_t* info, + int index); + +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info); + +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( + rocksdb_backup_engine_t* be); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create( + rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir, + uint64_t log_size_for_flush, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy( + rocksdb_checkpoint_t* checkpoint); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_open_for_read_only_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, + unsigned char error_if_log_file_exist, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** colummn_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( + const rocksdb_options_t* options, const char* name, size_t* lencf, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy( + char** list, size_t len); + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family(rocksdb_t* db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy( + rocksdb_column_family_handle_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_put( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_put_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* start_key, + size_t start_key_len, const char* end_key, size_t end_key_len, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_merge( + rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_write( + rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr); + +/* Returns NULL if not found. A malloc()ed array otherwise. + Stores the length of the array in *vallen. */ +extern ROCKSDB_LIBRARY_API char* rocksdb_get( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +// if values_list[i] == NULL and errs[i] == NULL, +// then we got status.IsNotFound(), which we will not return. +// all errors except status status.ok() and status.IsNotFound() are returned. +// +// errs, values_list and values_list_sizes must be num_keys in length, +// allocated by the caller. +// errs is a list of strings as opposed to the conventional one error, +// where errs[i] is the status for retrieval of keys_list[i]. +// each non-NULL errs entry is a malloc()ed, null terminated string. +// each non-NULL values_list entry is a malloc()ed array, with +// the length for each stored in values_list_sizes[i]. +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get( + rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys, + const char* const* keys_list, const size_t* keys_list_sizes, + char** values_list, size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, + char** errptr +); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators( + rocksdb_t *db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( + rocksdb_t* db, const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, + const char* propname); +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int( + rocksdb_t* db, + const char* propname, uint64_t *out_val); + +/* returns 0 on success, -1 otherwise */ +int rocksdb_property_int_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t *out_val); + +extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( + rocksdb_t* db, int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes); + +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, + const char* start_key, + size_t start_key_len, + const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt( + rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key, + size_t start_key_len, const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db, + const char* name); + +extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush( + rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, + char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions( + rocksdb_t* db, unsigned char force, char** errptr); + +/* Management operations */ + +extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_repair_db( + const rocksdb_options_t* options, const char* name, char** errptr); + +/* Iterator */ + +extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid( + const rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*, + const char* k, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*, + const char* k, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key( + const rocksdb_iterator_t*, size_t* klen); +extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value( + const rocksdb_iterator_t*, size_t* vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( + const rocksdb_iterator_t*, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid( + const rocksdb_wal_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) ; +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) ; +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) ; + +/* Write batch */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( + const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, + const char* key, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( + rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( + rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range( + rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev( + rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data( + rocksdb_writebatch_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data( + rocksdb_writebatch_t*, size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point( + rocksdb_writebatch_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point( + rocksdb_writebatch_t*, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point( + rocksdb_writebatch_t*, char** errptr); + +/* Write batch with index */ + +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create( + size_t reserved_bytes, + unsigned char overwrite_keys); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create_from( + const char* rep, size_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t*, + const char* key, + size_t klen, + const char* val, + size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, + int num_values, const char* const* values_list, + const size_t* values_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t*, + const char* key, + size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range( + rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len, + const char* end_key, size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* end_key, + size_t end_key_len); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev( + rocksdb_writebatch_wi_t* b, int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* start_keys_list, + const size_t* start_keys_list_sizes, const char* const* end_keys_list, + const size_t* end_keys_list_sizes); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data( + rocksdb_writebatch_wi_t*, const char* blob, size_t len); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate( + rocksdb_writebatch_wi_t* b, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data( + rocksdb_writebatch_wi_t* b, + size_t* size); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point( + rocksdb_writebatch_wi_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point( + rocksdb_writebatch_wi_t*, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch( + rocksdb_writebatch_wi_t* wbwi, + const rocksdb_options_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf( + rocksdb_writebatch_wi_t* wbwi, + const rocksdb_options_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, + rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_wi_t* wbwi, + char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( + rocksdb_writebatch_wi_t* wbwi, + rocksdb_iterator_t* base_iterator); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( + rocksdb_writebatch_wi_t* wbwi, + rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* cf); + +/* Block based table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* +rocksdb_block_based_options_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( + rocksdb_block_based_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( + rocksdb_block_based_table_options_t* options, size_t block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_size_deviation( + rocksdb_block_based_table_options_t* options, int block_size_deviation); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_restart_interval( + rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, int index_block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, unsigned char partition_filters); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy( + rocksdb_block_based_table_options_t* options, + rocksdb_filterpolicy_t* filter_policy); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache( + rocksdb_block_based_table_options_t* options, unsigned char no_block_cache); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache( + rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_block_cache_compressed( + rocksdb_block_based_table_options_t* options, + rocksdb_cache_t* block_cache_compressed); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_whole_key_filtering( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version( + rocksdb_block_based_table_options_t*, int); +enum { + rocksdb_block_based_table_index_type_binary_search = 0, + rocksdb_block_based_table_index_type_hash_search = 1, + rocksdb_block_based_table_index_type_two_level_index_search = 2, +}; +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +enum { + rocksdb_block_based_table_data_block_index_type_binary_search = 0, + rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1, +}; +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_index_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_hash_ratio( + rocksdb_block_based_table_options_t* options, double v); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_hash_index_allow_collision( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( + rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); + +/* Cuckoo table options */ + +extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern ROCKSDB_LIBRARY_API void +rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options); + +/* Options */ +extern ROCKSDB_LIBRARY_API void rocksdb_set_options( + rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( + rocksdb_options_t* opt, int total_threads); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( + rocksdb_options_t* opt, uint64_t block_cache_size_mb); +extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_optimize_universal_style_compaction( + rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( + rocksdb_options_t*, rocksdb_compactionfilter_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( + rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( + rocksdb_options_t*, rocksdb_comparator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( + rocksdb_options_t*, rocksdb_mergeoperator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*, + const rocksdb_dbpath_t** path_values, + size_t num_paths); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, + rocksdb_env_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, + rocksdb_logger_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( + rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( + rocksdb_options_t*, rocksdb_slicetransform_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_mem_compaction_level( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); +extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val); + +/* returns a pointer to a malloc()-ed, null terminated string */ +extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit( + rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit( + rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_rate_limit_delay_max_milliseconds(rocksdb_options_t*, + unsigned int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_table_cache_remove_scan_count_limit(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( + rocksdb_options_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, + const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, + unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, + uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_size_ratio( + rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t*, size_t, int32_t, int32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory( + rocksdb_options_t*, uint32_t, int, double, size_t); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress( + rocksdb_options_t* opt, int level); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( + rocksdb_options_t*, size_t); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t*, int); + +enum { + rocksdb_tolerate_corrupted_tail_records_recovery = 0, + rocksdb_absolute_consistency_recovery = 1, + rocksdb_point_in_time_recovery = 2, + rocksdb_skip_any_corrupted_records_recovery = 3 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( + rocksdb_options_t*, int); + +enum { + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3, + rocksdb_lz4_compression = 4, + rocksdb_lz4hc_compression = 5, + rocksdb_xpress_compression = 6, + rocksdb_zstd_compression = 7 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( + rocksdb_options_t*, int); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1, + rocksdb_fifo_compaction = 2 +}; +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_universal_compaction_options( + rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter( + rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( + rocksdb_options_t* opt, unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( + rocksdb_options_t* opt, rocksdb_cache_t* cache +); + +/* RateLimiter */ +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t*); + +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_total_metric_count = 68 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_perfcontext_metric( + rocksdb_perfcontext_t* context, int metric); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy( + rocksdb_perfcontext_t* context); + +/* Compaction Filter */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* +rocksdb_compactionfilter_create( + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy( + rocksdb_compactionfilter_t*); + +/* Compaction Filter Context */ + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context); + +/* Compaction Filter Factory */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t* +rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t*); + +/* Comparator */ + +extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create( + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy( + rocksdb_comparator_t*); + +/* Filter policy */ + +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( + void* state, void (*destructor)(void*), + char* (*create_filter)(void*, const char* const* key_array, + const size_t* key_length_array, int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)(void*, const char* key, size_t length, + const char* filter, size_t filter_length), + void (*delete_filter)(void*, const char* filter, size_t filter_length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy( + rocksdb_filterpolicy_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom(int bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(int bits_per_key); + +/* Merge Operator */ + +extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t* +rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( + rocksdb_mergeoperator_t*); + +/* Read options */ + +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( + rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t*, unsigned char); +// The functionality that this option controlled has been removed. +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys( + rocksdb_readoptions_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t*, unsigned char); + +/* Write options */ + +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* +rocksdb_writeoptions_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( + rocksdb_writeoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( + rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); + +/* Compact range options */ + +extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* +rocksdb_compactoptions_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( + rocksdb_compactoptions_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( + rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( + rocksdb_compactoptions_t*, int); + +/* Flush options */ + +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* +rocksdb_flushoptions_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( + rocksdb_flushoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t*, unsigned char); + +/* Cache */ + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( + size_t capacity); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( + rocksdb_cache_t* cache, size_t capacity); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_usage(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); + +/* DBPath */ + +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size); +extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); + +/* Env */ + +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( + rocksdb_envoptions_t* opt); + +/* SstFile */ + +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env, + const rocksdb_options_t* io_options); +extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t* +rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* comparator); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open( + rocksdb_sstfilewriter_t* writer, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + const char* val, size_t vallen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete( + rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish( + rocksdb_sstfilewriter_t* writer, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size( + rocksdb_sstfilewriter_t* writer, uint64_t* file_size); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( + rocksdb_sstfilewriter_t* writer); + +extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create(); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char ingest_behind); +extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + +/* SliceTransform */ + +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create( + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), + const char* (*name)(void*)); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* + rocksdb_slicetransform_create_fixed_prefix(size_t); +extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* +rocksdb_slicetransform_create_noop(); +extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( + rocksdb_slicetransform_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create(); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* +rocksdb_fifo_compaction_options_create(); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts); + +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( + const rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t*); + +/* Utility Helpers */ + +extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string( + const rocksdb_options_t* base_options, const char* opts_str, + rocksdb_options_t* new_options, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range( + rocksdb_t* db, const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr); + +/* Transactions */ + +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + char** errptr); + +rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot( + rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( + rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_transaction_options_t* txn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint( + rocksdb_transaction_t* txn, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( + rocksdb_transaction_t* txn); + +// This snapshot should be freed using rocksdb_free +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, unsigned char exclusive, + char** errptr); + +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t *batch, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( + rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options); + +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( + rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db, + char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, + const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db( + rocksdb_t* base_db); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* +rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db); + +/* Transaction Options */ + +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* +rocksdb_transactiondb_options_create(); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* +rocksdb_transaction_options_create(); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( + rocksdb_transaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth); + +extern ROCKSDB_LIBRARY_API void +rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create(); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v); + +// referring to convention (3), this should be used by client +// to free memory that was malloc()ed +extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( + rocksdb_pinnableslice_t* v); +extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( + const rocksdb_pinnableslice_t* t, size_t* vlen); + +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* + rocksdb_memory_consumers_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h new file mode 100644 index 000000000..77ddf525d --- /dev/null +++ b/src/rocksdb/include/rocksdb/cache.h @@ -0,0 +1,278 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#pragma once + +#include <stdint.h> +#include <memory> +#include <string> +#include "rocksdb/memory_allocator.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; + +extern const bool kDefaultToAdaptiveMutex; + +enum CacheMetadataChargePolicy { + kDontChargeCacheMetadata, + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + +struct LRUCacheOptions { + // Capacity of the cache. + size_t capacity = 0; + + // Cache is sharded into 2^num_shard_bits shards, + // by hash of key. Refer to NewLRUCache for further + // information. + int num_shard_bits = -1; + + // If strict_capacity_limit is set, + // insert to the cache will fail when cache is full. + bool strict_capacity_limit = false; + + // Percentage of cache reserved for high priority entries. + // If greater than zero, the LRU list will be split into a high-pri + // list and a low-pri list. High-pri entries will be insert to the + // tail of high-pri list, while low-pri entries will be first inserted to + // the low-pri list (the midpoint). This is refered to as + // midpoint insertion strategy to make entries never get hit in cache + // age out faster. + // + // See also + // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. + double high_pri_pool_ratio = 0.5; + + // If non-nullptr will use this allocator instead of system allocator when + // allocating memory for cache blocks. Call this method before you start using + // the cache! + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). + std::shared_ptr<MemoryAllocator> memory_allocator; + + // Whether to use adaptive mutexes for cache shards. Note that adaptive + // mutexes need to be supported by the platform in order for this to have any + // effect. The default value is true if RocksDB is compiled with + // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. + bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + LRUCacheOptions() {} + LRUCacheOptions(size_t _capacity, int _num_shard_bits, + bool _strict_capacity_limit, double _high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr, + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) + : capacity(_capacity), + num_shard_bits(_num_shard_bits), + strict_capacity_limit(_strict_capacity_limit), + high_pri_pool_ratio(_high_pri_pool_ratio), + memory_allocator(std::move(_memory_allocator)), + use_adaptive_mutex(_use_adaptive_mutex), + metadata_charge_policy(_metadata_charge_policy) {} +}; + +// Create a new cache with a fixed size capacity. The cache is sharded +// to 2^num_shard_bits shards, by hash of the key. The total capacity +// is divided and evenly assigned to each shard. If strict_capacity_limit +// is set, insert to the cache will fail when cache is full. User can also +// set percentage of the cache reserves for high priority entries via +// high_pri_pool_pct. +// num_shard_bits = -1 means it is automatically determined: every shard +// will be at least 512KB and number of shard bits will not exceed 6. +extern std::shared_ptr<Cache> NewLRUCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); + +extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts); + +// Similar to NewLRUCache, but create a cache based on CLOCK algorithm with +// better concurrent performance in some cases. See util/clock_cache.cc for +// more detail. +// +// Return nullptr if it is not supported. +extern std::shared_ptr<Cache> NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); +class Cache { + public: + // Depending on implementation, cache entries with high priority could be less + // likely to get evicted than low priority entries. + enum class Priority { HIGH, LOW }; + + Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr) + : memory_allocator_(std::move(allocator)) {} + // No copying allowed + Cache(const Cache&) = delete; + Cache& operator=(const Cache&) = delete; + + // Destroys all existing entries by calling the "deleter" + // function that was passed via the Insert() function. + // + // @See Insert + virtual ~Cache() {} + + // Opaque handle to an entry stored in the cache. + struct Handle {}; + + // The type of the Cache + virtual const char* Name() const = 0; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) = 0; + + // If the cache has no mapping for "key", returns nullptr. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // If stats is not nullptr, relative tickers could be used inside the + // function. + virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0; + + // Increments the reference count for the handle if it refers to an entry in + // the cache. Returns true if refcount was incremented; otherwise, returns + // false. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Ref(Handle* handle) = 0; + + /** + * Release a mapping returned by a previous Lookup(). A released entry might + * still remain in cache in case it is later looked up by others. If + * force_erase is set then it also erase it from the cache if there is no + * other reference to it. Erasing it should call the deleter function that + * was provided when the + * entry was inserted. + * + * Returns true if the entry was also erased. + */ + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Release(Handle* handle, bool force_erase = false) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + // Return a new numeric id. May be used by multiple clients who are + // sharding the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // sets the maximum configured capacity of the cache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage + virtual void SetCapacity(size_t capacity) = 0; + + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Get the flag whether to return error on insertion when cache reaches its + // full capacity. + virtual bool HasStrictCapacityLimit() const = 0; + + // returns the maximum configured capacity of the cache + virtual size_t GetCapacity() const = 0; + + // returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + + // returns the memory size for a specific entry in the cache. + virtual size_t GetUsage(Handle* handle) const = 0; + + // returns the memory size for the entries in use by the system + virtual size_t GetPinnedUsage() const = 0; + + // returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData(){ + // default implementation is noop + } + + // Apply callback to all entries in the cache + // If thread_safe is true, it will also lock the accesses. Otherwise, it will + // access the cache without the lock held + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) = 0; + + // Remove all entries. + // Prerequisite: no entry is referenced. + virtual void EraseUnRefEntries() = 0; + + virtual std::string GetPrintableOptions() const { return ""; } + + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + + private: + std::shared_ptr<MemoryAllocator> memory_allocator_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h new file mode 100644 index 000000000..b6a70ea64 --- /dev/null +++ b/src/rocksdb/include/rocksdb/cleanable.h @@ -0,0 +1,71 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cleanable { + public: + Cleanable(); + // No copy constructor and copy assignment allowed. + Cleanable(Cleanable&) = delete; + Cleanable& operator=(Cleanable&) = delete; + + ~Cleanable(); + + // Move constructor and move assignment is allowed. + Cleanable(Cleanable&&); + Cleanable& operator=(Cleanable&&); + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + void DelegateCleanupsTo(Cleanable* other); + // DoCleanup and also resets the pointers for reuse + inline void Reset() { + DoCleanup(); + cleanup_.function = nullptr; + cleanup_.next = nullptr; + } + + protected: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + // It also becomes the owner of c + void RegisterCleanup(Cleanup* c); + + private: + // Performs all the cleanups. It does not reset the pointers. Making it + // private + // to prevent misuse + inline void DoCleanup() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr;) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h new file mode 100644 index 000000000..976507831 --- /dev/null +++ b/src/rocksdb/include/rocksdb/compaction_filter.h @@ -0,0 +1,212 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class SliceTransform; + +// Context information of a compaction run +struct CompactionFilterContext { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; +}; + +// CompactionFilter allows an application to modify/delete a key-value at +// the time of compaction. + +class CompactionFilter { + public: + enum ValueType { + kValue, + kMergeOperand, + kBlobIndex, // used internally by BlobDB. + }; + + enum class Decision { + kKeep, + kRemove, + kChangeValue, + kRemoveAndSkipUntil, + }; + + enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; + + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + // Which column family this compaction is for. + uint32_t column_family_id; + }; + + virtual ~CompactionFilter() {} + + // The compaction process invokes this + // method for kv that is being compacted. A return value + // of false indicates that the kv should be preserved in the + // output of this compaction run and a return value of true + // indicates that this key-value should be removed from the + // output of the compaction. The application can inspect + // the existing value of the key and make decision based on it. + // + // Key-Values that are results of merge operation during compaction are not + // passed into this function. Currently, when you have a mix of Put()s and + // Merge()s on a same key, we only guarantee to process the merge operands + // through the compaction filters. Put()s might be processed, or might not. + // + // When the value is to be preserved, the application has the option + // to modify the existing_value and pass it back through new_value. + // value_changed needs to be set to true in this case. + // + // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a + // DB* object) will not guarantee to preserve the state of the DB with + // CompactionFilter. Data seen from a snapshot might disppear after a + // compaction finishes. If you use snapshots, think twice about whether you + // want to use compaction filter and whether you are using it in a safe way. + // + // If multithreaded compaction is being used *and* a single CompactionFilter + // instance was supplied via Options::compaction_filter, this method may be + // called from different threads concurrently. The application must ensure + // that the call is thread-safe. + // + // If the CompactionFilter was created by a factory, then it will only ever + // be used by a single thread that is doing the compaction run, and this + // call does not need to be thread-safe. However, multiple filters may be + // in existence and operating concurrently. + virtual bool Filter(int /*level*/, const Slice& /*key*/, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const { + return false; + } + + // The compaction process invokes this method on every merge operand. If this + // method returns true, the merge operand will be ignored and not written out + // in the compaction output + // + // Note: If you are using a TransactionDB, it is not recommended to implement + // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB + // may not realize there is a write conflict and may allow a Transaction to + // Commit that should have failed. Instead, it is better to implement any + // Merge filtering inside the MergeOperator. + virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const { + return false; + } + + // An extended API. Called for both values and merge operands. + // Allows changing value and skipping ranges of keys. + // The default implementation uses Filter() and FilterMergeOperand(). + // If you're overriding this method, no need to override the other two. + // `value_type` indicates whether this key-value corresponds to a normal + // value (e.g. written with Put()) or a merge operand (written with Merge()). + // + // Possible return values: + // * kKeep - keep the key-value pair. + // * kRemove - remove the key-value pair or merge operand. + // * kChangeValue - keep the key and change the value/operand to *new_value. + // * kRemoveAndSkipUntil - remove this key-value pair, and also remove + // all key-value pairs with key in [key, *skip_until). This range + // of keys will be skipped without reading, potentially saving some + // IO operations compared to removing the keys one by one. + // + // *skip_until <= key is treated the same as Decision::kKeep + // (since the range [key, *skip_until) is empty). + // + // Caveats: + // - The keys are skipped even if there are snapshots containing them, + // i.e. values removed by kRemoveAndSkipUntil can disappear from a + // snapshot - beware if you're using TransactionDB or + // DB::GetSnapshot(). + // - If value for a key was overwritten or merged into (multiple Put()s + // or Merge()s), and compaction filter skips this key with + // kRemoveAndSkipUntil, it's possible that it will remove only + // the new value, exposing the old value that was supposed to be + // overwritten. + // - Doesn't work with PlainTableFactory in prefix mode. + // - If you use kRemoveAndSkipUntil, consider also reducing + // compaction_readahead_size option. + // + // Note: If you are using a TransactionDB, it is not recommended to filter + // out or modify merge operands (ValueType::kMergeOperand). + // If a merge operation is filtered out, TransactionDB may not realize there + // is a write conflict and may allow a Transaction to Commit that should have + // failed. Instead, it is better to implement any Merge filtering inside the + // MergeOperator. + virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + switch (value_type) { + case ValueType::kValue: { + bool value_changed = false; + bool rv = Filter(level, key, existing_value, new_value, &value_changed); + if (rv) { + return Decision::kRemove; + } + return value_changed ? Decision::kChangeValue : Decision::kKeep; + } + case ValueType::kMergeOperand: { + bool rv = FilterMergeOperand(level, key, existing_value); + return rv ? Decision::kRemove : Decision::kKeep; + } + case ValueType::kBlobIndex: + return Decision::kKeep; + } + assert(false); + return Decision::kKeep; + } + + // Internal (BlobDB) use only. Do not override in application code. + virtual BlobDecision PrepareBlobOutput(const Slice& /* key */, + const Slice& /* existing_value */, + std::string* /* new_value */) const { + return BlobDecision::kKeep; + } + + // This function is deprecated. Snapshots will always be ignored for + // compaction filters, because we realized that not ignoring snapshots doesn't + // provide the gurantee we initially thought it would provide. Repeatable + // reads will not be guaranteed anyway. If you override the function and + // returns false, we will fail the compaction. + virtual bool IgnoreSnapshots() const { return true; } + + // Returns a name that identifies this compaction filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const = 0; +}; + +// Each compaction will create a new CompactionFilter allowing the +// application to know about different compactions +class CompactionFilterFactory { + public: + virtual ~CompactionFilterFactory() {} + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this compaction filter factory. + virtual const char* Name() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h new file mode 100644 index 000000000..8949b43e5 --- /dev/null +++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +struct CompactionJobStats { + CompactionJobStats() { Reset(); } + void Reset(); + // Aggregate the CompactionJobStats from another instance with this one + void Add(const CompactionJobStats& stats); + + // the elapsed time of this compaction in microseconds. + uint64_t elapsed_micros; + + // the elapsed CPU time of this compaction in microseconds. + uint64_t cpu_micros; + + // the number of compaction input records. + uint64_t num_input_records; + // the number of compaction input files. + size_t num_input_files; + // the number of compaction input files at the output level. + size_t num_input_files_at_output_level; + + // the number of compaction output records. + uint64_t num_output_records; + // the number of compaction output files. + size_t num_output_files; + + // true if the compaction is a manual compaction + bool is_manual_compaction; + + // the size of the compaction input in bytes. + uint64_t total_input_bytes; + // the size of the compaction output in bytes. + uint64_t total_output_bytes; + + // number of records being replaced by newer record associated with same key. + // this could be a new value or a deletion entry for that key so this field + // sums up all updated and deleted keys + uint64_t num_records_replaced; + + // the sum of the uncompressed input keys in bytes. + uint64_t total_input_raw_key_bytes; + // the sum of the uncompressed input values in bytes. + uint64_t total_input_raw_value_bytes; + + // the number of deletion entries before compaction. Deletion entries + // can disappear after compaction because they expired + uint64_t num_input_deletion_records; + // number of deletion records that were found obsolete and discarded + // because it is not possible to delete any more keys with this entry + // (i.e. all possible deletions resulting from it have been completed) + uint64_t num_expired_deletion_records; + + // number of corrupt keys (ParseInternalKey returned false when applied to + // the key) encountered and written out. + uint64_t num_corrupt_keys; + + // Following counters are only populated if + // options.report_bg_io_stats = true; + + // Time spent on file's Append() call. + uint64_t file_write_nanos; + + // Time spent on sync file range. + uint64_t file_range_sync_nanos; + + // Time spent on file fsync. + uint64_t file_fsync_nanos; + + // Time spent on preparing file write (fallocate, etc) + uint64_t file_prepare_write_nanos; + + // 0-terminated strings storing the first 8 bytes of the smallest and + // largest key in the output. + static const size_t kMaxPrefixLength = 8; + + std::string smallest_output_key_prefix; + std::string largest_output_key_prefix; + + // number of single-deletes which do not meet a put + uint64_t num_single_del_fallthru; + + // number of single-deletes which meet something other than a put + uint64_t num_single_del_mismatch; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h new file mode 100644 index 000000000..76981d108 --- /dev/null +++ b/src/rocksdb/include/rocksdb/comparator.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since rocksdb may invoke its methods concurrently +// from multiple threads. +class Comparator { + public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + + virtual ~Comparator() {} + + static const char* Type() { return "Comparator"; } + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // Compares two slices for equality. The following invariant should always + // hold (and is the default implementation): + // Equal(a, b) iff Compare(a, b) == 0 + // Overwrite only if equality comparisons can be done more efficiently than + // three-way comparisons. + virtual bool Equal(const Slice& a, const Slice& b) const { + return Compare(a, b) == 0; + } + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; + + // if it is a wrapped comparator, may return the root one. + // return itself it is not wrapped. + virtual const Comparator* GetRootComparator() const { return this; } + + // given two keys, determine if t is the successor of s + virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, + const Slice& /*t*/) const { + return false; + } + + // return true if two keys with different byte sequences can be regarded + // as equal by this comparator. + // The major use case is to determine if DataBlockHashIndex is compatible + // with the customized comparator. + virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h new file mode 100644 index 000000000..4fc6b7940 --- /dev/null +++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" + +namespace ROCKSDB_NAMESPACE { + +class ConcurrentTaskLimiter { + public: + virtual ~ConcurrentTaskLimiter() {} + + // Returns a name that identifies this concurrent task limiter. + virtual const std::string& GetName() const = 0; + + // Set max concurrent tasks. + // limit = 0 means no new task allowed. + // limit < 0 means no limitation. + virtual void SetMaxOutstandingTask(int32_t limit) = 0; + + // Reset to unlimited max concurrent task. + virtual void ResetMaxOutstandingTask() = 0; + + // Returns current outstanding task count. + virtual int32_t GetOutstandingTask() const = 0; +}; + +// Create a ConcurrentTaskLimiter that can be shared with mulitple CFs +// across RocksDB instances to control concurrent tasks. +// +// @param name: Name of the limiter. +// @param limit: max concurrent tasks. +// limit = 0 means no new task allowed. +// limit < 0 means no limitation. +extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h new file mode 100644 index 000000000..442303d94 --- /dev/null +++ b/src/rocksdb/include/rocksdb/convenience.h @@ -0,0 +1,351 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +// The following set of functions provide a way to construct RocksDB Options +// from a string or a string-to-string map. Here're the general rule of +// setting option values from strings by type. Some RocksDB types are also +// supported in these APIs. Please refer to the comment of the function itself +// to find more information about how to config those RocksDB types. +// +// * Strings: +// Strings will be used as values directly without any truncating or +// trimming. +// +// * Booleans: +// - "true" or "1" => true +// - "false" or "0" => false. +// [Example]: +// - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or +// - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString. +// +// * Integers: +// Integers are converted directly from string, in addition to the following +// units that we support: +// - 'k' or 'K' => 2^10 +// - 'm' or 'M' => 2^20 +// - 'g' or 'G' => 2^30 +// - 't' or 'T' => 2^40 // only for unsigned int with sufficient bits. +// [Example]: +// - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or +// - "arena_block_size=19G" in GetColumnFamilyOptionsFromString. +// +// * Doubles / Floating Points: +// Doubles / Floating Points are converted directly from string. Note that +// currently we do not support units. +// [Example]: +// - {"hard_rate_limit", "2.1"} in GetColumnFamilyOptionsFromMap, or +// - "hard_rate_limit=2.1" in GetColumnFamilyOptionsFromString. +// * Array / Vectors: +// An array is specified by a list of values, where ':' is used as +// the delimiter to separate each value. +// [Example]: +// - {"compression_per_level", "kNoCompression:kSnappyCompression"} +// in GetColumnFamilyOptionsFromMap, or +// - "compression_per_level=kNoCompression:kSnappyCompression" in +// GetColumnFamilyOptionsFromMapString +// * Enums: +// The valid values of each enum are identical to the names of its constants. +// [Example]: +// - CompressionType: valid values are "kNoCompression", +// "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ... +// - CompactionStyle: valid values are "kCompactionStyleLevel", +// "kCompactionStyleUniversal", "kCompactionStyleFIFO", and +// "kCompactionStyleNone". +// + +// Take a default ColumnFamilyOptions "base_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// ColumnFamilyOptions "new_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in ColumnFOptions: +// +// * table_factory: +// table_factory can be configured using our custom nested-option syntax. +// +// {option_a=value_a; option_b=value_b; option_c=value_c; ... } +// +// A nested option is enclosed by two curly braces, within which there are +// multiple option assignments. Each assignment is of the form +// "variable_name=value;". +// +// Currently we support the following types of TableFactory: +// - BlockBasedTableFactory: +// Use name "block_based_table_factory" to initialize table_factory with +// BlockBasedTableFactory. Its BlockBasedTableFactoryOptions can be +// configured using the nested-option syntax. +// [Example]: +// * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"} +// is equivalent to assigning table_factory with a BlockBasedTableFactory +// that has 1M LRU block-cache with block size equals to 4k: +// ColumnFamilyOptions cf_opt; +// BlockBasedTableOptions blk_opt; +// blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024); +// blk_opt.block_size = 4 * 1024; +// cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt)); +// - PlainTableFactory: +// Use name "plain_table_factory" to initialize table_factory with +// PlainTableFactory. Its PlainTableFactoryOptions can be configured using +// the nested-option syntax. +// [Example]: +// * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"} +// +// * memtable_factory: +// Use "memtable" to config memtable_factory. Here are the supported +// memtable factories: +// - SkipList: +// Pass "skip_list:<lookahead>" to config memtable to use SkipList, +// or simply "skip_list" to use the default SkipList. +// [Example]: +// * {"memtable", "skip_list:5"} is equivalent to setting +// memtable to SkipListFactory(5). +// - PrefixHash: +// Pass "prfix_hash:<hash_bucket_count>" to config memtable +// to use PrefixHash, or simply "prefix_hash" to use the default +// PrefixHash. +// [Example]: +// * {"memtable", "prefix_hash:1000"} is equivalent to setting +// memtable to NewHashSkipListRepFactory(hash_bucket_count). +// - HashLinkedList: +// Pass "hash_linkedlist:<hash_bucket_count>" to config memtable +// to use HashLinkedList, or simply "hash_linkedlist" to use the default +// HashLinkedList. +// [Example]: +// * {"memtable", "hash_linkedlist:1000"} is equivalent to +// setting memtable to NewHashLinkListRepFactory(1000). +// - VectorRepFactory: +// Pass "vector:<count>" to config memtable to use VectorRepFactory, +// or simply "vector" to use the default Vector memtable. +// [Example]: +// * {"memtable", "vector:1024"} is equivalent to setting memtable +// to VectorRepFactory(1024). +// - HashCuckooRepFactory: +// Pass "cuckoo:<write_buffer_size>" to use HashCuckooRepFactory with the +// specified write buffer size, or simply "cuckoo" to use the default +// HashCuckooRepFactory. +// [Example]: +// * {"memtable", "cuckoo:1024"} is equivalent to setting memtable +// to NewHashCuckooRepFactory(1024). +// +// * compression_opts: +// Use "compression_opts" to config compression_opts. The value format +// is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>". +// [Example]: +// * {"compression_opts", "4:5:6:7"} is equivalent to setting: +// ColumnFamilyOptions cf_opt; +// cf_opt.compression_opts.window_bits = 4; +// cf_opt.compression_opts.level = 5; +// cf_opt.compression_opts.strategy = 6; +// cf_opt.compression_opts.max_dict_bytes = 7; +// +// @param base_options the default options of the output "new_options". +// @param opts_map an option name to value map for specifying how "new_options" +// should be set. +// @param new_options the resulting options based on "base_options" with the +// change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_options" will be set to "base_options". +Status GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map<std::string, std::string>& opts_map, + ColumnFamilyOptions* new_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a default DBOptions "base_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// DBOptions "new_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in DBOptions: +// +// * rate_limiter_bytes_per_sec: +// RateLimiter can be configured directly by specifying its bytes_per_sec. +// [Example]: +// - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to +// passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec. +// +// @param base_options the default options of the output "new_options". +// @param opts_map an option name to value map for specifying how "new_options" +// should be set. +// @param new_options the resulting options based on "base_options" with the +// change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_options" will be set to "base_options". +Status GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map<std::string, std::string>& opts_map, + DBOptions* new_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, + bool input_strings_escaped = false, bool ignore_unknown_options = false); + +// Take a default PlainTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// PlainTableOptions "new_table_options". +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped = false, + bool ignore_unknown_options = false); + +// Take a string representation of option names and values, apply them into the +// base_options, and return the new options as a result. The string has the +// following format: +// "write_buffer_size=1024;max_write_buffer_number=2" +// Nested options config is also possible. For example, you can define +// BlockBasedTableOptions as part of the string for block-based table factory: +// "write_buffer_size=1024;block_based_table_factory={block_size=4k};" +// "max_write_buffer_num=2" +Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); + +Status GetDBOptionsFromString(const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + +Status GetStringFromDBOptions(std::string* opts_str, + const DBOptions& db_options, + const std::string& delimiter = "; "); + +Status GetStringFromColumnFamilyOptions(std::string* opts_str, + const ColumnFamilyOptions& cf_options, + const std::string& delimiter = "; "); + +Status GetStringFromCompressionType(std::string* compression_str, + CompressionType compression_type); + +std::vector<CompressionType> GetSupportedCompressions(); + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options); + +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr<MemTableRepFactory>* new_mem_factory); + +Status GetOptionsFromString(const Options& base_options, + const std::string& opts_str, Options* new_options); + +Status StringToMap(const std::string& opts_str, + std::unordered_map<std::string, std::string>* opts_map); + +// Request stopping background work, if wait is true wait until it's done +void CancelAllBackgroundWork(DB* db, bool wait = false); + +// Delete files which are entirely in the given range +// Could leave some keys in the range which are in files which are not +// entirely in the range. Also leaves L0 files regardless of whether they're +// in the range. +// Snapshots before the delete might not see the data in the given range. +Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool include_end = true); + +// Delete files in multiple ranges at once +// Delete files in a lot of ranges one at a time can be slow, use this API for +// better performance in that case. +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path); + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h new file mode 100644 index 000000000..3108003f1 --- /dev/null +++ b/src/rocksdb/include/rocksdb/db.h @@ -0,0 +1,1525 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stdint.h> +#include <stdio.h> +#include <map> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> +#include "rocksdb/iterator.h" +#include "rocksdb/listener.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/sst_file_writer.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/types.h" +#include "rocksdb/version.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) +#elif _WIN32 +#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) +#endif + +namespace ROCKSDB_NAMESPACE { + +struct Options; +struct DBOptions; +struct ColumnFamilyOptions; +struct ReadOptions; +struct WriteOptions; +struct FlushOptions; +struct CompactionOptions; +struct CompactRangeOptions; +struct TableProperties; +struct ExternalSstFileInfo; +class WriteBatch; +class Env; +class EventListener; +class StatsHistoryIterator; +class TraceWriter; +#ifdef ROCKSDB_LITE +class CompactionJobInfo; +#endif +class FileSystem; + +extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; +struct ColumnFamilyDescriptor { + std::string name; + ColumnFamilyOptions options; + ColumnFamilyDescriptor() + : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {} + ColumnFamilyDescriptor(const std::string& _name, + const ColumnFamilyOptions& _options) + : name(_name), options(_options) {} +}; + +class ColumnFamilyHandle { + public: + virtual ~ColumnFamilyHandle() {} + // Returns the name of the column family associated with the current handle. + virtual const std::string& GetName() const = 0; + // Returns the ID of the column family associated with the current handle. + virtual uint32_t GetID() const = 0; + // Fills "*desc" with the up-to-date descriptor of the column family + // associated with this handle. Since it fills "*desc" with the up-to-date + // information, this call might internally lock and release DB mutex to + // access the up-to-date CF options. In addition, all the pointer-typed + // options cannot be referenced any longer than the original options exist. + // + // Note that this function is not supported in RocksDBLite. + virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0; + // Returns the comparator of the column family associated with the + // current handle. + virtual const Comparator* GetComparator() const = 0; +}; + +static const int kMajorVersion = __ROCKSDB_MAJOR__; +static const int kMinorVersion = __ROCKSDB_MINOR__; + +// A range of keys +struct Range { + Slice start; + Slice limit; + + Range() {} + Range(const Slice& s, const Slice& l) : start(s), limit(l) {} +}; + +struct RangePtr { + const Slice* start; + const Slice* limit; + + RangePtr() : start(nullptr), limit(nullptr) {} + RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} +}; + +struct IngestExternalFileArg { + ColumnFamilyHandle* column_family = nullptr; + std::vector<std::string> external_files; + IngestExternalFileOptions options; +}; + +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + +// A collections of table properties objects, where +// key: is the table's file name. +// value: the table properties object of the given table. +typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>> + TablePropertiesCollection; + +// A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores nullptr in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, const std::string& name, + DB** dbptr); + + // Open the database for read only. All DB interfaces + // that modify data, like put/delete, will return error. + // If the db is opened in read only mode, then no compactions + // will happen. + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. + static Status OpenForReadOnly(const Options& options, const std::string& name, + DB** dbptr, + bool error_if_log_file_exist = false); + + // Open the database for read only with column families. When opening DB with + // read only, you can specify only a subset of column families in the + // database that should be opened. However, you always need to specify default + // column family. The default column family name is 'default' and it's stored + // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. + static Status OpenForReadOnly( + const DBOptions& db_options, const std::string& name, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr, + bool error_if_log_file_exist = false); + + // The following OpenAsSecondary functions create a secondary instance that + // can dynamically tail the MANIFEST of a primary that must have already been + // created. User can call TryCatchUpWithPrimary to make the secondary + // instance catch up with primary (WAL tailing is NOT supported now) whenever + // the user feels necessary. Column families created by the primary after the + // secondary instance starts are currently ignored by the secondary instance. + // Column families opened by secondary and dropped by the primary will be + // dropped by secondary as well. However the user of the secondary instance + // can still access the data of such dropped column family as long as they + // do not destroy the corresponding column family handle. + // WAL tailing is not supported at present, but will arrive soon. + // + // The options argument specifies the options to open the secondary instance. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the user should + // delete it after use. + // Open DB as secondary instance with only the default column family. + // Return OK on success, non-OK on failures. + static Status OpenAsSecondary(const Options& options, const std::string& name, + const std::string& secondary_path, DB** dbptr); + + // Open DB as secondary instance with column families. You can open a subset + // of column families in secondary mode. + // The db_options specify the database specific options. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The column_families argument specifieds a list of column families to open. + // If any of the column families does not exist, the function returns non-OK + // status. + // The handles is an out-arg corresponding to the opened database column + // familiy handles. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the caller should + // delete it after use. Before deleting the dbptr, the user should also + // delete the pointers stored in handles vector. + // Return OK on success, on-OK on failures. + static Status OpenAsSecondary( + const DBOptions& db_options, const std::string& name, + const std::string& secondary_path, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); + + // Open DB with column families. + // db_options specify database specific options + // column_families is the vector of all column families in the database, + // containing column family name and options. You need to open ALL column + // families in the database. To get the list of column families, you can use + // ListColumnFamilies(). Also, you can open only a subset of column families + // for read-only access. + // The default column family name is 'default' and it's stored + // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName. + // If everything is OK, handles will on return be the same size + // as column_families --- handles[i] will be a handle that you + // will use to operate on column family column_family[i]. + // Before delete DB, you have to close All column families by calling + // DestroyColumnFamilyHandle() with all the handles. + static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); + + virtual Status Resume() { return Status::NotSupported(); } + + // Close the DB by releasing resources, closing files etc. This should be + // called before calling the destructor so that the caller can get back a + // status in case there are any errors. This will not fsync the WAL files. + // If syncing is required, the caller must first call SyncWAL(), or Write() + // using an empty write batch with WriteOptions.sync=true. + // Regardless of the return status, the DB must be freed. + // If the return status is Aborted(), closing fails because there is + // unreleased snapshot in the system. In this case, users can release + // the unreleased snapshots and try again and expect it to succeed. For + // other status, recalling Close() will be no-op. + // If the return status is NotSupported(), then the DB implementation does + // cleanup in the destructor + virtual Status Close() { return Status::NotSupported(); } + + // ListColumnFamilies will open the DB specified by argument name + // and return the list of all column families in that DB + // through column_families argument. The ordering of + // column families in column_families is unspecified. + static Status ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector<std::string>* column_families); + + DB() {} + // No copying allowed + DB(const DB&) = delete; + void operator=(const DB&) = delete; + + virtual ~DB(); + + // Create a column_family and return the handle of column family + // through the argument handle. + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle); + + // Bulk create column families with the same column family options. + // Return the handles of the column families through the argument handles. + // In case of error, the request may succeed partially, and handles will + // contain column family handles that it managed to create, and have size + // equal to the number of created column families. + virtual Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector<std::string>& column_family_names, + std::vector<ColumnFamilyHandle*>* handles); + + // Bulk create column families. + // Return the handles of the column families through the argument handles. + // In case of error, the request may succeed partially, and handles will + // contain column family handles that it managed to create, and have size + // equal to the number of created column families. + virtual Status CreateColumnFamilies( + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles); + + // Drop a column family specified by column_family handle. This call + // only records a drop record in the manifest and prevents the column + // family from flushing and compacting. + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + + // Bulk drop column families. This call only records drop records in the + // manifest and prevents the column families from flushing and compacting. + // In case of error, the request may succeed partially. User may call + // ListColumnFamilies to check the result. + virtual Status DropColumnFamilies( + const std::vector<ColumnFamilyHandle*>& column_families); + + // Close a column family specified by column_family handle and destroy + // the column family handle specified to avoid double deletion. This call + // deletes the column family handle by default. Use this method to + // close column family instead of deleting column family handle directly + virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family); + + // Set the database entry for "key" to "value". + // If "key" already exists, it will be overwritten. + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Put(options, DefaultColumnFamily(), key, value); + } + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const WriteOptions& options, const Slice& key) { + return Delete(options, DefaultColumnFamily(), key); + } + + // Remove the database entry for "key". Requires that the key exists + // and was not overwritten. Returns OK on success, and a non-OK status + // on error. It is not an error if "key" did not exist in the database. + // + // If a key is overwritten (by calling Put() multiple times), then the result + // of calling SingleDelete() on this key is undefined. SingleDelete() only + // behaves correctly if there has been only one Put() for this key since the + // previous call to SingleDelete() for this key. + // + // This feature is currently an experimental performance optimization + // for a very specific workload. It is up to the caller to ensure that + // SingleDelete is only used for a key that is not deleted using Delete() or + // written using Merge(). Mixing SingleDelete operations with Deletes and + // Merges can result in undefined behavior. + // + // Note: consider setting options.sync = true. + virtual Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const WriteOptions& options, const Slice& key) { + return SingleDelete(options, DefaultColumnFamily(), key); + } + + // Removes the database entries in the range ["begin_key", "end_key"), i.e., + // including "begin_key" and excluding "end_key". Returns OK on success, and + // a non-OK status on error. It is not an error if no keys exist in the range + // ["begin_key", "end_key"). + // + // This feature is now usable in production, with the following caveats: + // 1) Accumulating many range tombstones in the memtable will degrade read + // performance; this can be avoided by manually flushing occasionally. + // 2) Limiting the maximum number of open files in the presence of range + // tombstones can degrade read performance. To avoid this problem, set + // max_open_files to -1 whenever possible. + virtual Status DeleteRange(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key); + + // Merge the database entry for "key" with "value". Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Merge(options, DefaultColumnFamily(), key, value); + } + + // Apply the specified updates to the database. + // If `updates` contains no update, WAL will still be synced if + // options.sync=true. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. + virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) { + return Get(options, DefaultColumnFamily(), key, value); + } + + // Returns all the merge operands corresponding to the key. If the + // number of merge operands in DB is greater than + // merge_operands_options.expected_max_number_of_operands + // no merge operands are returned and status is Incomplete. Merge operands + // returned are in the order of insertion. + // merge_operands- Points to an array of at-least + // merge_operands_options.expected_max_number_of_operands and the + // caller is responsible for allocating it. If the status + // returned is Incomplete then number_of_operands will contain + // the total number of merge operands found in DB for key. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + virtual std::vector<Status> MultiGet(const ReadOptions& options, + const std::vector<Slice>& keys, + std::vector<std::string>* values) { + return MultiGet( + options, + std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()), + keys, values); + } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector<ColumnFamilyHandle*> cf; + std::vector<Slice> user_keys; + std::vector<Status> status; + std::vector<std::string> vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector<ColumnFamilyHandle*> cf; + std::vector<Slice> user_keys; + std::vector<Status> status; + std::vector<std::string> vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. + // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, std::string* /*value*/, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; + } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); + } + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) = 0; + virtual Iterator* NewIterator(const ReadOptions& options) { + return NewIterator(options, DefaultColumnFamily()); + } + // Returns iterators from a consistent database state across multiple + // column families. Iterators are heap allocated and need to be deleted + // before the db is deleted + virtual Status NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + // + // nullptr will be returned if the DB fails to take a snapshot or does + // not support snapshot. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + +#ifndef ROCKSDB_LITE + // Contains all valid property arguments for GetProperty(). + // + // NOTE: Property names cannot end in numbers since those are interpreted as + // arguments, e.g., see kNumFilesAtLevelPrefix. + struct Properties { + // "rocksdb.num-files-at-level<N>" - returns string containing the number + // of files at level <N>, where <N> is an ASCII representation of a + // level number (e.g., "0"). + static const std::string kNumFilesAtLevelPrefix; + + // "rocksdb.compression-ratio-at-level<N>" - returns string containing the + // compression ratio of data at level <N>, where <N> is an ASCII + // representation of a level number (e.g., "0"). Here, compression + // ratio is defined as uncompressed data size / compressed file size. + // Returns "-1.0" if no open files at level <N>. + static const std::string kCompressionRatioAtLevelPrefix; + + // "rocksdb.stats" - returns a multi-line string containing the data + // described by kCFStats followed by the data described by kDBStats. + static const std::string kStats; + + // "rocksdb.sstables" - returns a multi-line string summarizing current + // SST files. + static const std::string kSSTables; + + // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and + // "rocksdb.cf-file-histogram" together. See below for description + // of the two. + static const std::string kCFStats; + + // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with + // general columm family stats per-level over db's lifetime ("L<n>"), + // aggregated over db's lifetime ("Sum"), and aggregated over the + // interval since the last retrieval ("Int"). + // It could also be used to return the stats in the format of the map. + // In this case there will a pair of string to array of double for + // each level as well as for "Sum". "Int" stats will not be affected + // when this form of stats are retrieved. + static const std::string kCFStatsNoFileHistogram; + + // "rocksdb.cf-file-histogram" - print out how many file reads to every + // level, as well as the histogram of latency of single requests. + static const std::string kCFFileHistogram; + + // "rocksdb.dbstats" - returns a multi-line string with general database + // stats, both cumulative (over the db's lifetime) and interval (since + // the last retrieval of kDBStats). + static const std::string kDBStats; + + // "rocksdb.levelstats" - returns multi-line string containing the number + // of files per level and total size of each level (MB). + static const std::string kLevelStats; + + // "rocksdb.num-immutable-mem-table" - returns number of immutable + // memtables that have not yet been flushed. + static const std::string kNumImmutableMemTable; + + // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable + // memtables that have already been flushed. + static const std::string kNumImmutableMemTableFlushed; + + // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is + // pending; otherwise, returns 0. + static const std::string kMemTableFlushPending; + + // "rocksdb.num-running-flushes" - returns the number of currently running + // flushes. + static const std::string kNumRunningFlushes; + + // "rocksdb.compaction-pending" - returns 1 if at least one compaction is + // pending; otherwise, returns 0. + static const std::string kCompactionPending; + + // "rocksdb.num-running-compactions" - returns the number of currently + // running compactions. + static const std::string kNumRunningCompactions; + + // "rocksdb.background-errors" - returns accumulated number of background + // errors. + static const std::string kBackgroundErrors; + + // "rocksdb.cur-size-active-mem-table" - returns approximate size of active + // memtable (bytes). + static const std::string kCurSizeActiveMemTable; + + // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active + // and unflushed immutable memtables (bytes). + static const std::string kCurSizeAllMemTables; + + // "rocksdb.size-all-mem-tables" - returns approximate size of active, + // unflushed immutable, and pinned immutable memtables (bytes). + static const std::string kSizeAllMemTables; + + // "rocksdb.num-entries-active-mem-table" - returns total number of entries + // in the active memtable. + static const std::string kNumEntriesActiveMemTable; + + // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries + // in the unflushed immutable memtables. + static const std::string kNumEntriesImmMemTables; + + // "rocksdb.num-deletes-active-mem-table" - returns total number of delete + // entries in the active memtable. + static const std::string kNumDeletesActiveMemTable; + + // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete + // entries in the unflushed immutable memtables. + static const std::string kNumDeletesImmMemTables; + + // "rocksdb.estimate-num-keys" - returns estimated number of total keys in + // the active and unflushed immutable memtables and storage. + static const std::string kEstimateNumKeys; + + // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for + // reading SST tables, excluding memory used in block cache (e.g., + // filter and index blocks). + static const std::string kEstimateTableReadersMem; + + // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete + // files is enabled; otherwise, returns a non-zero number. + static const std::string kIsFileDeletionsEnabled; + + // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the + // database. + static const std::string kNumSnapshots; + + // "rocksdb.oldest-snapshot-time" - returns number representing unix + // timestamp of oldest unreleased snapshot. + static const std::string kOldestSnapshotTime; + + // "rocksdb.oldest-snapshot-sequence" - returns number representing + // sequence number of oldest unreleased snapshot. + static const std::string kOldestSnapshotSequence; + + // "rocksdb.num-live-versions" - returns number of live versions. `Version` + // is an internal data structure. See version_set.h for details. More + // live versions often mean more SST files are held from being deleted, + // by iterators or unfinished compactions. + static const std::string kNumLiveVersions; + + // "rocksdb.current-super-version-number" - returns number of current LSM + // version. It is a uint64_t integer number, incremented after there is + // any change to the LSM tree. The number is not preserved after restarting + // the DB. After DB restart, it will start from 0 again. + static const std::string kCurrentSuperVersionNumber; + + // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of + // live data in bytes. + static const std::string kEstimateLiveDataSize; + + // "rocksdb.min-log-number-to-keep" - return the minimum log number of the + // log files that should be kept. + static const std::string kMinLogNumberToKeep; + + // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file + // number for an obsolete SST to be kept. The max value of `uint64_t` + // will be returned if all obsolete files can be deleted. + static const std::string kMinObsoleteSstNumberToKeep; + + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST + // files. + // WARNING: may slow down online queries if there are too many files. + static const std::string kTotalSstFilesSize; + + // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST + // files belong to the latest LSM tree. + static const std::string kLiveSstFilesSize; + + // "rocksdb.base-level" - returns number of level to which L0 data will be + // compacted. + static const std::string kBaseLevel; + + // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total + // number of bytes compaction needs to rewrite to get all levels down + // to under target size. Not valid for other compactions than level- + // based. + static const std::string kEstimatePendingCompactionBytes; + + // "rocksdb.aggregated-table-properties" - returns a string representation + // of the aggregated table properties of the target column family. + static const std::string kAggregatedTableProperties; + + // "rocksdb.aggregated-table-properties-at-level<N>", same as the previous + // one but only returns the aggregated table properties of the + // specified level "N" at the target column family. + static const std::string kAggregatedTablePropertiesAtLevel; + + // "rocksdb.actual-delayed-write-rate" - returns the current actual delayed + // write rate. 0 means no delay. + static const std::string kActualDelayedWriteRate; + + // "rocksdb.is-write-stopped" - Return 1 if write has been stopped. + static const std::string kIsWriteStopped; + + // "rocksdb.estimate-oldest-key-time" - returns an estimation of + // oldest key timestamp in the DB. Currently only available for + // FIFO compaction with + // compaction_options_fifo.allow_compaction = false. + static const std::string kEstimateOldestKeyTime; + + // "rocksdb.block-cache-capacity" - returns block cache capacity. + static const std::string kBlockCacheCapacity; + + // "rocksdb.block-cache-usage" - returns the memory size for the entries + // residing in block cache. + static const std::string kBlockCacheUsage; + + // "rocksdb.block-cache-pinned-usage" - returns the memory size for the + // entries being pinned. + static const std::string kBlockCachePinnedUsage; + + // "rocksdb.options-statistics" - returns multi-line string + // of options.statistics + static const std::string kOptionsStatistics; + }; +#endif /* ROCKSDB_LITE */ + + // DB implementations can export properties about their state via this method. + // If "property" is a valid property understood by this DB implementation (see + // Properties struct above for valid options), fills "*value" with its current + // value and returns true. Otherwise, returns false. + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) = 0; + virtual bool GetProperty(const Slice& property, std::string* value) { + return GetProperty(DefaultColumnFamily(), property, value); + } + virtual bool GetMapProperty(ColumnFamilyHandle* column_family, + const Slice& property, + std::map<std::string, std::string>* value) = 0; + virtual bool GetMapProperty(const Slice& property, + std::map<std::string, std::string>* value) { + return GetMapProperty(DefaultColumnFamily(), property, value); + } + + // Similar to GetProperty(), but only works for a subset of properties whose + // return value is an integer. Return the value by integer. Supported + // properties: + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" + // "rocksdb.background-errors" + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.num-deletes-active-mem-table" + // "rocksdb.num-deletes-imm-mem-tables" + // "rocksdb.estimate-num-keys" + // "rocksdb.estimate-table-readers-mem" + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" + // "rocksdb.num-live-versions" + // "rocksdb.current-super-version-number" + // "rocksdb.estimate-live-data-size" + // "rocksdb.min-log-number-to-keep" + // "rocksdb.min-obsolete-sst-number-to-keep" + // "rocksdb.total-sst-files-size" + // "rocksdb.live-sst-files-size" + // "rocksdb.base-level" + // "rocksdb.estimate-pending-compaction-bytes" + // "rocksdb.num-running-compactions" + // "rocksdb.num-running-flushes" + // "rocksdb.actual-delayed-write-rate" + // "rocksdb.is-write-stopped" + // "rocksdb.estimate-oldest-key-time" + // "rocksdb.block-cache-capacity" + // "rocksdb.block-cache-usage" + // "rocksdb.block-cache-pinned-usage" + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) = 0; + virtual bool GetIntProperty(const Slice& property, uint64_t* value) { + return GetIntProperty(DefaultColumnFamily(), property, value); + } + + // Reset internal stats for DB and all column families. + // Note this doesn't reset options.statistics as it is not owned by + // DB. + virtual Status ResetStats() { + return Status::NotSupported("Not implemented"); + } + + // Same as GetIntProperty(), but this one returns the aggregated int + // property from all column families. + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) = 0; + + // Flags for DB::GetSizeApproximation that specify whether memtable + // stats should be included, or file stats approximation or both + enum SizeApproximationFlags : uint8_t { + NONE = 0, + INCLUDE_MEMTABLES = 1 << 0, + INCLUDE_FILES = 1 << 1 + }; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) = 0; + + // Simpler versions of the GetApproximateSizes() method above. + // The include_flags argumenbt must of type DB::SizeApproximationFlags + // and can not be NONE. + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + SizeApproximationOptions options; + options.include_memtabtles = + (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; + options.include_files = + (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; + GetApproximateSizes(options, column_family, range, n, sizes); + } + virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); + } + + // The method is similar to GetApproximateSizes, except it + // returns approximate number of records in memtables. + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) = 0; + virtual void GetApproximateMemTableStats(const Range& range, + uint64_t* const count, + uint64_t* const size) { + GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size); + } + + // Deprecated versions of GetApproximateSizes + ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( + const Range* range, int n, uint64_t* sizes, bool include_memtable) { + uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; + if (include_memtable) { + include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; + } + GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); + } + ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( + ColumnFamilyHandle* column_family, const Range* range, int n, + uint64_t* sizes, bool include_memtable) { + uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; + if (include_memtable) { + include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; + } + GetApproximateSizes(column_family, range, n, sizes, include_flags); + } + + // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==nullptr is treated as a key before all keys in the database. + // end==nullptr is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(options, nullptr, nullptr); + // Note that after the entire database is compacted, all data are pushed + // down to the last level containing any data. If the total data size after + // compaction is reduced, that level might not be appropriate for hosting all + // the files. In this case, client could set options.change_level to true, to + // move the files back to the minimum level capable of holding the data set + // or a given level (specified by non-negative options.target_level). + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) = 0; + virtual Status CompactRange(const CompactRangeOptions& options, + const Slice* begin, const Slice* end) { + return CompactRange(options, DefaultColumnFamily(), begin, end); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange( + ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, + bool change_level = false, int target_level = -1, + uint32_t target_path_id = 0) { + CompactRangeOptions options; + options.change_level = change_level; + options.target_level = target_level; + options.target_path_id = target_path_id; + return CompactRange(options, column_family, begin, end); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange( + const Slice* begin, const Slice* end, bool change_level = false, + int target_level = -1, uint32_t target_path_id = 0) { + CompactRangeOptions options; + options.change_level = change_level; + options.target_level = target_level; + options.target_path_id = target_path_id; + return CompactRange(options, DefaultColumnFamily(), begin, end); + } + + virtual Status SetOptions( + ColumnFamilyHandle* /*column_family*/, + const std::unordered_map<std::string, std::string>& /*new_options*/) { + return Status::NotSupported("Not implemented"); + } + virtual Status SetOptions( + const std::unordered_map<std::string, std::string>& new_options) { + return SetOptions(DefaultColumnFamily(), new_options); + } + + virtual Status SetDBOptions( + const std::unordered_map<std::string, std::string>& new_options) = 0; + + // CompactFiles() inputs a list of files specified by file numbers and + // compacts them to the specified level. Note that the behavior is different + // from CompactRange() in that CompactFiles() performs the compaction job + // using the CURRENT thread. + // + // @see GetDataBaseMetaData + // @see GetColumnFamilyMetaData + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) = 0; + + virtual Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) { + return CompactFiles(compact_options, DefaultColumnFamily(), + input_file_names, output_level, output_path_id, + output_file_names, compaction_job_info); + } + + // This function will wait until all currently running background processes + // finish. After it returns, no background process will be run until + // ContinueBackgroundWork is called + virtual Status PauseBackgroundWork() = 0; + virtual Status ContinueBackgroundWork() = 0; + + // This function will enable automatic compactions for the given column + // families if they were previously disabled. The function will first set the + // disable_auto_compactions option for each column family to 'false', after + // which it will schedule a flush/compaction. + // + // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API + // does NOT schedule a flush/compaction afterwards, and only changes the + // parameter itself within the column family option. + // + virtual Status EnableAutoCompaction( + const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0; + + virtual void DisableManualCompaction() = 0; + virtual void EnableManualCompaction() = 0; + + // Number of levels used for this DB. + virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; + virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0; + virtual int MaxMemCompactionLevel() { + return MaxMemCompactionLevel(DefaultColumnFamily()); + } + + // Number of files in level-0 that would stop writes. + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0; + virtual int Level0StopWriteTrigger() { + return Level0StopWriteTrigger(DefaultColumnFamily()); + } + + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + + // Get Env object from the DB + virtual Env* GetEnv() const = 0; + + virtual FileSystem* GetFileSystem() const; + + // Get DB Options that we use. During the process of opening the + // column family, the options provided when calling DB::Open() or + // DB::CreateColumnFamily() will have been "sanitized" and transformed + // in an implementation-defined manner. + virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0; + virtual Options GetOptions() const { + return GetOptions(DefaultColumnFamily()); + } + + virtual DBOptions GetDBOptions() const = 0; + + // Flush all mem-table data. + // Flush a single column family, even when atomic flush is enabled. To flush + // multiple column families, use Flush(options, column_families). + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) = 0; + virtual Status Flush(const FlushOptions& options) { + return Flush(options, DefaultColumnFamily()); + } + // Flushes multiple column families. + // If atomic flush is not enabled, Flush(options, column_families) is + // equivalent to calling Flush(options, column_family) multiple times. + // If atomic flush is enabled, Flush(options, column_families) will flush all + // column families specified in 'column_families' up to the latest sequence + // number at the time when flush is requested. + // Note that RocksDB 5.15 and earlier may not be able to open later versions + // with atomic flush enabled. + virtual Status Flush( + const FlushOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families) = 0; + + // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL + // afterwards. + virtual Status FlushWAL(bool /*sync*/) { + return Status::NotSupported("FlushWAL not implemented"); + } + // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the + // same as Write() with sync=true: in the latter case the changes won't be + // visible until the sync is done. + // Currently only works if allow_mmap_writes = false in Options. + virtual Status SyncWAL() = 0; + + // Lock the WAL. Also flushes the WAL after locking. + virtual Status LockWAL() { + return Status::NotSupported("LockWAL not implemented"); + } + + // Unlock the WAL. + virtual Status UnlockWAL() { + return Status::NotSupported("UnlockWAL not implemented"); + } + + // The sequence number of the most recent transaction. + virtual SequenceNumber GetLatestSequenceNumber() const = 0; + + // Instructs DB to preserve deletes with sequence numbers >= passed seqnum. + // Has no effect if DBOptions.preserve_deletes is set to false. + // This function assumes that user calls this function with monotonically + // increasing seqnums (otherwise we can't guarantee that a particular delete + // hasn't been already processed); returns true if the value was successfully + // updated, false if user attempted to call if with seqnum <= current value. + virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; + +#ifndef ROCKSDB_LITE + + // Prevent file deletions. Compactions will continue to occur, + // but no obsolete files will be deleted. Calling this multiple + // times have the same effect as calling it once. + virtual Status DisableFileDeletions() = 0; + + // Allow compactions to delete obsolete files. + // If force == true, the call to EnableFileDeletions() will guarantee that + // file deletions are enabled after the call, even if DisableFileDeletions() + // was called multiple times before. + // If force == false, EnableFileDeletions will only enable file deletion + // after it's been called at least as many times as DisableFileDeletions(), + // enabling the two methods to be called by two threads concurrently without + // synchronization -- i.e., file deletions will be enabled only after both + // threads call EnableFileDeletions() + virtual Status EnableFileDeletions(bool force = true) = 0; + + // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup + + // Retrieve the list of all files in the database. The files are + // relative to the dbname and are not absolute paths. Despite being relative + // paths, the file names begin with "/". The valid size of the manifest file + // is returned in manifest_file_size. The manifest file is an ever growing + // file, but only the portion specified by manifest_file_size is valid for + // this snapshot. Setting flush_memtable to true does Flush before recording + // the live files. Setting flush_memtable to false is useful when we don't + // want to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. + // + // In case you have multiple column families, even if flush_memtable is true, + // you still need to call GetSortedWalFiles after GetLiveFiles to compensate + // for new data that arrived to already-flushed column families while other + // column families were flushing + virtual Status GetLiveFiles(std::vector<std::string>&, + uint64_t* manifest_file_size, + bool flush_memtable = true) = 0; + + // Retrieve the sorted list of all wal files with earliest file first + virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + + // Retrieve information about the current wal file + // + // Note that the log might have rolled after this call in which case + // the current_log_file would not point to the current log file. + // + // Additionally, for the sake of optimization current_log_file->StartSequence + // would always be set to 0 + virtual Status GetCurrentWalFile( + std::unique_ptr<LogFile>* current_log_file) = 0; + + // Retrieves the creation time of the oldest file in the DB. + // This API only works if max_open_files = -1, if it is not then + // Status returned is Status::NotSupported() + // The file creation time is set using the env provided to the DB. + // If the DB was created from a very old release then its possible that + // the SST files might not have file_creation_time property and even after + // moving to a newer release its possible that some files never got compacted + // and may not have file_creation_time property. In both the cases + // file_creation_time is considered 0 which means this API will return + // creation_time = 0 as there wouldn't be a timestamp lower than 0. + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0; + + // Note: this API is not yet consistent with WritePrepared transactions. + // Sets iter to an iterator that is positioned at a write-batch containing + // seq_number. If the sequence number is non existent, it returns an iterator + // at the first available seq_no after the requested seq_no + // Returns Status::OK if iterator is valid + // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + // use this api, else the WAL files will get + // cleared aggressively and the iterator might keep getting invalid before + // an update is read. + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) = 0; + +// Windows API macro interference +#undef DeleteFile + // Delete the file name from the db directory and update the internal state to + // reflect that. Supports deletion of sst and log files only. 'name' must be + // path relative to the db directory. eg. 000001.sst, /archive/000003.log + virtual Status DeleteFile(std::string name) = 0; + + // Returns a list of all table files with their level, start key + // and end key + virtual void GetLiveFilesMetaData( + std::vector<LiveFileMetaData>* /*metadata*/) {} + + // Obtains the meta data of the specified column family of the DB. + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, + ColumnFamilyMetaData* /*metadata*/) {} + + // Get the metadata of the default column family. + void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) { + GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); + } + + // IngestExternalFile() will load a list of external SST files (1) into the DB + // Two primary modes are supported: + // - Duplicate keys in the new files will overwrite exiting keys (default) + // - Duplicate keys will be skipped (set ingest_behind=true) + // In the first mode we will try to find the lowest possible level that + // the file can fit in, and ingest the file into this level (2). A file that + // have a key range that overlap with the memtable key range will require us + // to Flush the memtable first before ingesting the file. + // In the second mode we will always ingest in the bottom most level (see + // docs to IngestExternalFileOptions::ingest_behind). + // + // (1) External SST files can be created using SstFileWriter + // (2) We will try to ingest the files to the lowest possible level + // even if the file compression doesn't match the level compression + // (3) If IngestExternalFileOptions->ingest_behind is set to true, + // we always ingest at the bottommost level, which should be reserved + // for this purpose (see DBOPtions::allow_ingest_behind flag). + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector<std::string>& external_files, + const IngestExternalFileOptions& options) = 0; + + virtual Status IngestExternalFile( + const std::vector<std::string>& external_files, + const IngestExternalFileOptions& options) { + return IngestExternalFile(DefaultColumnFamily(), external_files, options); + } + + // IngestExternalFiles() will ingest files for multiple column families, and + // record the result atomically to the MANIFEST. + // If this function returns OK, all column families' ingestion must succeed. + // If this function returns NOK, or the process crashes, then non-of the + // files will be ingested into the database after recovery. + // Note that it is possible for application to observe a mixed state during + // the execution of this function. If the user performs range scan over the + // column families with iterators, iterator on one column family may return + // ingested data, while iterator on other column family returns old data. + // Users can use snapshot for a consistent view of data. + // If your db ingests multiple SST files using this API, i.e. args.size() + // > 1, then RocksDB 5.15 and earlier will not be able to open it. + // + // REQUIRES: each arg corresponds to a different column family: namely, for + // 0 <= i < j < len(args), args[i].column_family != args[j].column_family. + virtual Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& args) = 0; + + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB. + // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. When option specifies a + // move, the call ensures that the specified files at external_file_path are + // deleted on successful return and files are not modified on any error + // return. + // On error return, column family handle returned will be nullptr. + // ColumnFamily will be present on successful return and will not be present + // on error return. ColumnFamily may be present on any crash during this call. + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) = 0; + + virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; + + virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + + // AddFile() is deprecated, please use IngestExternalFile() + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + ColumnFamilyHandle* column_family, + const std::vector<std::string>& file_path_list, bool move_file = false, + bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(column_family, file_path_list, ifo); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + const std::vector<std::string>& file_path_list, bool move_file = false, + bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo); + } + + // AddFile() is deprecated, please use IngestExternalFile() + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + ColumnFamilyHandle* column_family, const std::string& file_path, + bool move_file = false, bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(column_family, {file_path}, ifo); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + const std::string& file_path, bool move_file = false, + bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo); + } + + // Load table file with information "file_info" into "column_family" + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + ColumnFamilyHandle* column_family, + const std::vector<ExternalSstFileInfo>& file_info_list, + bool move_file = false, bool skip_snapshot_check = false) { + std::vector<std::string> external_files; + for (const ExternalSstFileInfo& file_info : file_info_list) { + external_files.push_back(file_info.file_path); + } + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(column_family, external_files, ifo); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + const std::vector<ExternalSstFileInfo>& file_info_list, + bool move_file = false, bool skip_snapshot_check = false) { + std::vector<std::string> external_files; + for (const ExternalSstFileInfo& file_info : file_info_list) { + external_files.push_back(file_info.file_path); + } + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(DefaultColumnFamily(), external_files, ifo); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info, + bool move_file = false, bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(column_family, {file_info->file_path}, ifo); + } + + ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( + const ExternalSstFileInfo* file_info, bool move_file = false, + bool skip_snapshot_check = false) { + IngestExternalFileOptions ifo; + ifo.move_files = move_file; + ifo.snapshot_consistency = !skip_snapshot_check; + ifo.allow_global_seqno = false; + ifo.allow_blocking_flush = false; + return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path}, + ifo); + } + +#endif // ROCKSDB_LITE + + // Returns the unique ID which is read from IDENTITY file during the opening + // of database by setting in the identity variable + // Returns Status::OK if identity could be set properly + virtual Status GetDbIdentity(std::string& identity) const = 0; + + // Returns default column family handle + virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; + +#ifndef ROCKSDB_LITE + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) = 0; + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return GetPropertiesOfAllTables(DefaultColumnFamily(), props); + } + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) = 0; + + virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) { + return Status::NotSupported("SuggestCompactRange() is not implemented."); + } + + virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { + return Status::NotSupported("PromoteL0() is not implemented."); + } + + // Trace DB operations. Use EndTrace() to stop tracing. + virtual Status StartTrace(const TraceOptions& /*options*/, + std::unique_ptr<TraceWriter>&& /*trace_writer*/) { + return Status::NotSupported("StartTrace() is not implemented."); + } + + virtual Status EndTrace() { + return Status::NotSupported("EndTrace() is not implemented."); + } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. + virtual Status StartBlockCacheTrace( + const TraceOptions& /*options*/, + std::unique_ptr<TraceWriter>&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } +#endif // ROCKSDB_LITE + + // Needed for StackableDB + virtual DB* GetRootDB() { return this; } + + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in seconds, and end_time is an exclusive bound. + virtual Status GetStatsHistory( + uint64_t /*start_time*/, uint64_t /*end_time*/, + std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) { + return Status::NotSupported("GetStatsHistory() is not implemented."); + } + +#ifndef ROCKSDB_LITE + // Make the secondary instance catch up with the primary by tailing and + // replaying the MANIFEST and WAL of the primary. + // Column families created by the primary after the secondary instance starts + // will be ignored unless the secondary instance closes and restarts with the + // newly created column families. + // Column families that exist before secondary instance starts and dropped by + // the primary afterwards will be marked as dropped. However, as long as the + // secondary instance does not delete the corresponding column family + // handles, the data of the column family is still accessible to the + // secondary. + // TODO: we will support WAL tailing soon. + virtual Status TryCatchUpWithPrimary() { + return Status::NotSupported("Supported only by secondary instance"); + } +#endif // !ROCKSDB_LITE +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options, + const std::vector<ColumnFamilyDescriptor>& column_families = + std::vector<ColumnFamilyDescriptor>()); + +#ifndef ROCKSDB_LITE +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +// +// With this API, we will warn and skip data associated with column families not +// specified in column_families. +// +// @param column_families Descriptors for known column families +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector<ColumnFamilyDescriptor>& column_families); + +// @param unknown_cf_opts Options for column families encountered during the +// repair that were not specified in column_families. +Status RepairDB(const std::string& dbname, const DBOptions& db_options, + const std::vector<ColumnFamilyDescriptor>& column_families, + const ColumnFamilyOptions& unknown_cf_opts); + +// @param options These options will be used for the database and for ALL column +// families encountered during the repair +Status RepairDB(const std::string& dbname, const Options& options); + +#endif + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h new file mode 100644 index 000000000..17f4e6bde --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_bench_tool.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +int db_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h new file mode 100644 index 000000000..b7d4766a2 --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_dump_tool.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +struct DumpOptions { + // Database that will be dumped + std::string db_path; + // File location that will contain dump output + std::string dump_location; + // Don't include db information header in the dump + bool anonymous = false; +}; + +class DbDumpTool { + public: + bool Run(const DumpOptions& dump_options, + ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options()); +}; + +struct UndumpOptions { + // Database that we will load the dumped file into + std::string db_path; + // File location of the dumped file that will be loaded + std::string dump_location; + // Compact the db after loading the dumped file + bool compact_db = false; +}; + +class DbUndumpTool { + public: + bool Run(const UndumpOptions& undump_options, + ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options()); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h new file mode 100644 index 000000000..7d3d42c9d --- /dev/null +++ b/src/rocksdb/include/rocksdb/db_stress_tool.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +int db_stress_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h new file mode 100644 index 000000000..056d8a1c0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/env.h @@ -0,0 +1,1589 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#pragma once + +#include <stdint.h> +#include <cstdarg> +#include <functional> +#include <limits> +#include <memory> +#include <string> +#include <vector> +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ + __attribute__((__format__(__printf__, format_param, dots_param))) +#else +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) +#endif + +namespace ROCKSDB_NAMESPACE { + +class DynamicLibrary; +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; +class RandomRWFile; +class MemoryMappedFileBuffer; +class Directory; +struct DBOptions; +struct ImmutableDBOptions; +struct MutableDBOptions; +class RateLimiter; +class ThreadStatusUpdater; +struct ThreadStatus; + +const size_t kDefaultPageSize = 4 * 1024; + +// Options while opening a file to read/write +struct EnvOptions { + // Construct with default Options + EnvOptions(); + + // Construct from Options + explicit EnvOptions(const DBOptions& options); + + // If true, then use mmap to read data + bool use_mmap_reads = false; + + // If true, then use mmap to write data + bool use_mmap_writes = true; + + // If true, then use O_DIRECT for reading data + bool use_direct_reads = false; + + // If true, then use O_DIRECT for writing data + bool use_direct_writes = false; + + // If false, fallocate() calls are bypassed + bool allow_fallocate = true; + + // If true, set the FD_CLOEXEC on open fd. + bool set_fd_cloexec = true; + + // Allows OS to incrementally sync files to disk while they are being + // written, in the background. Issue one request for every bytes_per_sync + // written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync = 0; + + // When true, guarantees the file has at most `bytes_per_sync` bytes submitted + // for writeback at any given time. + // + // - If `sync_file_range` is supported it achieves this by waiting for any + // prior `sync_file_range`s to finish before proceeding. In this way, + // processing (compression, etc.) can proceed uninhibited in the gap + // between `sync_file_range`s, and we block only when I/O falls behind. + // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism + // always blocks, thus preventing the interleaving of I/O and processing. + // + // Note: Enabling this option does not provide any additional persistence + // guarantees, as it may use `sync_file_range`, which does not write out + // metadata. + // + // Default: false + bool strict_bytes_per_sync = false; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. This option will + // improve the performance in workloads where you sync the data on every + // write. By default, we set it to true for MANIFEST writes and false for + // WAL writes + bool fallocate_with_keep_size = true; + + // See DBOptions doc + size_t compaction_readahead_size = 0; + + // See DBOptions doc + size_t random_access_max_buffer_size = 0; + + // See DBOptions doc + size_t writable_file_max_buffer_size = 1024 * 1024; + + // If not nullptr, write rate limiting is enabled for flush and compaction + RateLimiter* rate_limiter = nullptr; +}; + +class Env { + public: + struct FileAttributes { + // File name + std::string name; + + // Size of file in bytes + uint64_t size_bytes; + }; + + Env() : thread_status_updater_(nullptr) {} + // No copying allowed + Env(const Env&) = delete; + void operator=(const Env&) = delete; + + virtual ~Env(); + + static const char* Type() { return "Environment"; } + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result); + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result, + std::shared_ptr<Env>* guard); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to rocksdb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr<SequentialFile>* result, + const EnvOptions& options) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr<RandomAccessFile>* result, + const EnvOptions& options) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + WLTH_NOT_SET = 0, // No hint information set + WLTH_NONE, // No hints about write life time + WLTH_SHORT, // Data written has a short life time + WLTH_MEDIUM, // Data written has a medium life time + WLTH_LONG, // Data written has a long life time + WLTH_EXTREME, // Data written has an extremely long life time + }; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status ReopenWritableFile(const std::string& /*fname*/, + std::unique_ptr<WritableFile>* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported(); + } + + // Reuse an existing file by renaming it and opening it as writable. + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options); + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewRandomRWFile(const std::string& /*fname*/, + std::unique_ptr<RandomRWFile>* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("RandomRWFile is not implemented in this Env"); + } + + // Opens `fname` as a memory-mapped file for read and write (in-place updates + // only, i.e., no appends). On success, stores a raw buffer covering the whole + // file in `*result`. The file must exist prior to this call. + virtual Status NewMemoryMappedFileBuffer( + const std::string& /*fname*/, + std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) { + return Status::NotSupported( + "MemoryMappedFileBuffer is not implemented in this Env"); + } + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + virtual Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) = 0; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + virtual Status FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual Status GetChildren(const std::string& dir, + std::vector<std::string>* result) = 0; + + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual Status GetChildrenFileAttributes(const std::string& dir, + std::vector<FileAttributes>* result); + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Truncate the named file to the specified size. + virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) { + return Status::NotSupported("Truncate is not supported for this Env"); + } + + // Create the specified directory. Returns error if directory exists. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual Status CreateDirIfMissing(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Store the last modification time of fname in *file_mtime. + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) = 0; + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Hard Link file src to target. + virtual Status LinkFile(const std::string& /*src*/, + const std::string& /*target*/) { + return Status::NotSupported("LinkFile is not supported for this Env"); + } + + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/) { + return Status::NotSupported( + "Getting number of file links is not supported for this Env"); + } + + virtual Status AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, bool* /*res*/) { + return Status::NotSupported("AreFilesSame is not supported for this Env"); + } + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ";") and looks for the + // library in those directories. If 'search path is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr<DynamicLibrary>* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + + // Priority for scheduling job in thread pool + enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; + + static std::string PriorityToString(Priority priority); + + // Priority for requesting bytes in rate limiter scheduler + enum IOPriority { IO_LOW = 0, IO_HIGH = 1, IO_TOTAL = 2 }; + + // Arrange to run "(*function)(arg)" once in a background thread, in + // the thread pool specified by pri. By default, jobs go to the 'LOW' + // priority thread pool. + + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + // When the UnSchedule function is called, the unschedFunction + // registered at the time of Schedule is invoked with arg as a parameter. + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW, void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) = 0; + + // Arrange to remove jobs for given arg from the queue_ if they are not + // already scheduled. Caller is expected to have exclusive lock on arg. + virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; } + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // Wait for all threads started by StartThread to terminate. + virtual void WaitForJoin() {} + + // Get thread pool queue length for specific thread pool. + virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const { + return 0; + } + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. + virtual Status NewLogger(const std::string& fname, + std::shared_ptr<Logger>* result); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // 0 indicates not supported. + virtual uint64_t NowCPUNanos() { return 0; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the current host name. + virtual Status GetHostName(char* name, uint64_t len) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Get full directory name for this db. + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) = 0; + + // The number of background worker threads of a specific thread pool + // for this environment. 'LOW' is the default pool. + // default number: 1 + virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + virtual int GetBackgroundThreads(Priority pri = LOW) = 0; + + virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) { + return Status::NotSupported("Not supported."); + } + + // Enlarge number of background worker threads of a specific thread pool + // for this environment if it is smaller than specified. 'LOW' is the default + // pool. + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; + + // Lower IO priority for threads from the specified pool. + virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} + + // Lower CPU priority for threads from the specified pool. + virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {} + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; + + // Generates a unique id that can be used to identify a db + virtual std::string GenerateUniqueId(); + + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for reading log files. + virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const; + + // OptimizeForManifestRead will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for reading manifest + // files. + virtual EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const; + + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const; + // OptimizeForManifestWrite will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for writing manifest + // files. Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const; + + // OptimizeForCompactionTableWrite will create a new EnvOptions object that is + // a copy of the EnvOptions in the parameters, but is optimized for writing + // table files. + virtual EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const; + + // OptimizeForCompactionTableWrite will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // table files. + virtual EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + + // Returns the status of all threads that belong to the current Env. + virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) { + return Status::NotSupported("Not supported."); + } + + // Returns the pointer to ThreadStatusUpdater. This function will be + // used in RocksDB internally to update thread status and supports + // GetThreadList(). + virtual ThreadStatusUpdater* GetThreadStatusUpdater() const { + return thread_status_updater_; + } + + // Returns the ID of the current thread. + virtual uint64_t GetThreadID() const; + +// This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& /*path*/, + uint64_t* /*diskfree*/) { + return Status::NotSupported(); + } + + virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + + // If you're adding methods here, remember to add them to EnvWrapper too. + + protected: + // The pointer to an internal structure that will update the + // status of each thread. + ThreadStatusUpdater* thread_status_updater_; +}; + +// The factory function to construct a ThreadStatusUpdater. Any Env +// that supports GetThreadList() feature should call this function in its +// constructor to initialize thread_status_updater_. +ThreadStatusUpdater* CreateThreadStatusUpdater(); + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() {} + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported("InvalidateCache not supported."); + } + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, + Slice* /*result*/, char* /*scratch*/) { + return Status::NotSupported(); + } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. +}; + +// A read IO request structure for use in MultiRead +struct ReadRequest { + // File offset in bytes + uint64_t offset; + + // Length to read in bytes + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + Slice result; + + // Status of read + Status status; +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() {} + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + // Readahead the file starting from offset by n bytes for caching. + virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) { + return Status::OK(); + } + + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping. If the function return Status is not ok, status of + // individual requests will be ignored and return status will be assumed + // for all read requests. The function return status is only meant for any + // any errors that occur before even processing specific read requests + virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest& req = reqs[i]; + req.status = Read(req.offset, req.len, &req.result, req.scratch); + } + return Status::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + } + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern /*pattern*/) {} + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported("InvalidateCache not supported."); + } + + // If you're adding methods here, remember to add them to + // RandomAccessFileWrapper too. +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(false) {} + + explicit WritableFile(const EnvOptions& options) + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + // No copying allowed + WritableFile(const WritableFile&) = delete; + void operator=(const WritableFile&) = delete; + + virtual ~WritableFile(); + + // Append data to the end of the file + // Note: A WriteabelFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + virtual Status Append(const Slice& data) = 0; + + // PositionedAppend data to the specified offset. The new EOF after append + // must be larger than the previous EOF. This is to be used when writes are + // not backed by OS buffers and hence has to always start from the start of + // the sector. The implementation thus needs to also rewrite the last + // partial sector. + // Note: PositionAppend does not guarantee moving the file offset after the + // write. A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + // + // PositionedAppend() can only happen on the page/sector boundaries. For that + // reason, if the last write was an incomplete sector we still need to rewind + // back to the nearest sector/page and rewrite the portion of it with whatever + // we need to add. We need to keep where we stop writing. + // + // PositionedAppend() can only write whole sectors. For that reason we have to + // pad with zeros for the last write and trim the file when closing according + // to the position we keep in the previous step. + // + // PositionedAppend() requires aligned buffer to be passed in. The alignment + // required is queried via GetRequiredBufferAlignment() + virtual Status PositionedAppend(const Slice& /* data */, + uint64_t /* offset */) { + return Status::NotSupported(); + } + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); } + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { return Sync(); } + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + virtual bool IsSyncThreadSafe() const { return false; } + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + /* + * Change the priority in rate limiter if rate limiting is enabled. + * If rate limiting is not enabled, this call has no effect. + */ + virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; } + + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize() { return 0; } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + virtual void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return Status::NotSupported("InvalidateCache not supported."); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) { + if (strict_bytes_per_sync_) { + return Sync(); + } + return Status::OK(); + } + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + virtual void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + // Pre-allocates space for a file. + virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) { + return Status::OK(); + } + + // If you're adding methods here, remember to add them to + // WritableFileWrapper too. + + protected: + size_t preallocation_block_size() { return preallocation_block_size_; } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + + protected: + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + const bool strict_bytes_per_sync_; +}; + +// A file abstraction for random reading and writing. +class RandomRWFile { + public: + RandomRWFile() {} + // No copying allowed + RandomRWFile(const RandomRWFile&) = delete; + RandomRWFile& operator=(const RandomRWFile&) = delete; + + virtual ~RandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual Status Write(uint64_t offset, const Slice& data) = 0; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + virtual Status Flush() = 0; + + virtual Status Sync() = 0; + + virtual Status Fsync() { return Sync(); } + + virtual Status Close() = 0; + + // If you're adding methods here, remember to add them to + // RandomRWFileWrapper too. +}; + +// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. +// Subclasses should release the mapping upon destruction. +class MemoryMappedFileBuffer { + public: + MemoryMappedFileBuffer(void* _base, size_t _length) + : base_(_base), length_(_length) {} + + virtual ~MemoryMappedFileBuffer() = 0; + + // We do not want to unmap this twice. We can make this class + // movable if desired, however, since + MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete; + MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete; + + void* GetBase() const { return base_; } + size_t GetLen() const { return length_; } + + protected: + void* base_; + const size_t length_; +}; + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class Directory { + public: + virtual ~Directory() {} + // Fsync directory. Can be called concurrently from multiple threads. + virtual Status Fsync() = 0; + + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; + } + + // If you're adding methods here, remember to add them to + // DirectoryWrapper too. +}; + +enum InfoLogLevel : unsigned char { + DEBUG_LEVEL = 0, + INFO_LEVEL, + WARN_LEVEL, + ERROR_LEVEL, + FATAL_LEVEL, + HEADER_LEVEL, + NUM_INFO_LOG_LEVELS, +}; + +// An interface for writing log messages. +class Logger { + public: + size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)(); + + explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : closed_(false), log_level_(log_level) {} + // No copying allowed + Logger(const Logger&) = delete; + void operator=(const Logger&) = delete; + + virtual ~Logger(); + + // Close the log file. Must be called before destructor. If the return + // status is NotSupported(), it means the implementation does cleanup in + // the destructor + virtual Status Close(); + + // Write a header to the log file with the specified format + // It is recommended that you log all header information at the start of the + // application. But it is not enforced. + virtual void LogHeader(const char* format, va_list ap) { + // Default implementation does a simple INFO level log write. + // Please override as per the logger class requirement. + Logv(format, ap); + } + + // Write an entry to the log file with the specified format. + virtual void Logv(const char* format, va_list ap) = 0; + + // Write an entry to the log file with the specified log level + // and format. Any log with level under the internal log level + // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be + // printed. + virtual void Logv(const InfoLogLevel log_level, const char* format, + va_list ap); + + virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; } + // Flush to the OS buffers + virtual void Flush() {} + virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; } + virtual void SetInfoLogLevel(const InfoLogLevel log_level) { + log_level_ = log_level; + } + + // If you're adding methods here, remember to add them to LoggerWrapper too. + + protected: + virtual Status CloseImpl(); + bool closed_; + + private: + InfoLogLevel log_level_; +}; + +// Identifies a locked file. +class FileLock { + public: + FileLock() {} + virtual ~FileLock(); + + private: + // No copying allowed + FileLock(const FileLock&) = delete; + void operator=(const FileLock&) = delete; +}; + +class DynamicLibrary { + public: + virtual ~DynamicLibrary() {} + + // Returns the name of the dynamic library. + virtual const char* Name() const = 0; + + // Loads the symbol for sym_name from the library and updates the input + // function. Returns the loaded symbol. + template <typename T> + Status LoadFunction(const std::string& sym_name, std::function<T>* function) { + assert(nullptr != function); + void* ptr = nullptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast<T*>(ptr); + return s; + } + // Loads and returns the symbol for sym_name from the library. + virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; +}; + +extern void LogFlush(const std::shared_ptr<Logger>& info_log); + +extern void Log(const InfoLogLevel log_level, + const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); + +// a set of log functions with different log levels. +extern void Header(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// Log the specified data to *info_log if info_log is non-nullptr. +// The default info log level is InfoLogLevel::INFO_LEVEL. +extern void Log(const std::shared_ptr<Logger>& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +extern void LogFlush(Logger* info_log); + +extern void Log(const InfoLogLevel log_level, Logger* info_log, + const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); + +// The default info log level is InfoLogLevel::INFO_LEVEL. +extern void Log(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// a set of log functions with different log levels. +extern void Header(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync = false); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public +// ROCKSDB_NAMESPACE::SequentialFileWrapper { +// public: +// MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target): +// ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {} +// Status Read(size_t n, Slice* result, char* scratch) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result, +// scratch); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t) : target_(t) {} + ~EnvWrapper() override; + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& options) override { + return target_->NewSequentialFile(f, r, options); + } + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr<RandomAccessFile>* r, + const EnvOptions& options) override { + return target_->NewRandomAccessFile(f, r, options); + } + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override { + return target_->NewWritableFile(f, r, options); + } + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) override { + return target_->ReopenWritableFile(fname, result, options); + } + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override { + return target_->ReuseWritableFile(fname, old_fname, r, options); + } + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& options) override { + return target_->NewRandomRWFile(fname, result, options); + } + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return target_->NewMemoryMappedFileBuffer(fname, result); + } + Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) override { + return target_->NewDirectory(name, result); + } + Status FileExists(const std::string& f) override { + return target_->FileExists(f); + } + Status GetChildren(const std::string& dir, + std::vector<std::string>* r) override { + return target_->GetChildren(dir, r); + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector<FileAttributes>* result) override { + return target_->GetChildrenFileAttributes(dir, result); + } + Status DeleteFile(const std::string& f) override { + return target_->DeleteFile(f); + } + Status Truncate(const std::string& fname, size_t size) override { + return target_->Truncate(fname, size); + } + Status CreateDir(const std::string& d) override { + return target_->CreateDir(d); + } + Status CreateDirIfMissing(const std::string& d) override { + return target_->CreateDirIfMissing(d); + } + Status DeleteDir(const std::string& d) override { + return target_->DeleteDir(d); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + return target_->GetFileSize(f, s); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + return target_->GetFileModificationTime(fname, file_mtime); + } + + Status RenameFile(const std::string& s, const std::string& t) override { + return target_->RenameFile(s, t); + } + + Status LinkFile(const std::string& s, const std::string& t) override { + return target_->LinkFile(s, t); + } + + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + return target_->NumFileLinks(fname, count); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + return target_->AreFilesSame(first, second, res); + } + + Status LockFile(const std::string& f, FileLock** l) override { + return target_->LockFile(f, l); + } + + Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr<DynamicLibrary>* result) override { + return target_->LoadLibrary(lib_name, search_path, result); + } + + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_->Schedule(f, a, pri, tag, u); + } + + int UnSchedule(void* tag, Priority pri) override { + return target_->UnSchedule(tag, pri); + } + + void StartThread(void (*f)(void*), void* a) override { + return target_->StartThread(f, a); + } + void WaitForJoin() override { return target_->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_->GetThreadPoolQueueLen(pri); + } + Status GetTestDirectory(std::string* path) override { + return target_->GetTestDirectory(path); + } + Status NewLogger(const std::string& fname, + std::shared_ptr<Logger>* result) override { + return target_->NewLogger(fname, result); + } + uint64_t NowMicros() override { return target_->NowMicros(); } + uint64_t NowNanos() override { return target_->NowNanos(); } + uint64_t NowCPUNanos() override { return target_->NowCPUNanos(); } + + void SleepForMicroseconds(int micros) override { + target_->SleepForMicroseconds(micros); + } + Status GetHostName(char* name, uint64_t len) override { + return target_->GetHostName(name, len); + } + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + return target_->GetAbsolutePath(db_path, output_path); + } + void SetBackgroundThreads(int num, Priority pri) override { + return target_->SetBackgroundThreads(num, pri); + } + int GetBackgroundThreads(Priority pri) override { + return target_->GetBackgroundThreads(pri); + } + + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_->SetAllowNonOwnerAccess(allow_non_owner_access); + } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool = LOW) override { + target_->LowerThreadPoolIOPriority(pool); + } + + void LowerThreadPoolCPUPriority(Priority pool = LOW) override { + target_->LowerThreadPoolCPUPriority(pool); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + Status GetThreadList(std::vector<ThreadStatus>* thread_list) override { + return target_->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_->GetThreadStatusUpdater(); + } + + uint64_t GetThreadID() const override { return target_->GetThreadID(); } + + std::string GenerateUniqueId() override { + return target_->GenerateUniqueId(); + } + + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return target_->OptimizeForLogRead(env_options); + } + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return target_->OptimizeForManifestRead(env_options); + } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(env_options, db_options); + } + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return target_->OptimizeForManifestWrite(env_options); + } + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(env_options, immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(env_options, db_options); + } + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + return target_->GetFreeSpace(path, diskfree); + } + void SanitizeEnvOptions(EnvOptions* env_opts) const override { + target_->SanitizeEnvOptions(env_opts); + } + + private: + Env* target_; +}; + +class SequentialFileWrapper : public SequentialFile { + public: + explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + return target_->Read(n, result, scratch); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return target_->PositionedRead(offset, n, result, scratch); + } + + private: + SequentialFile* target_; +}; + +class RandomAccessFileWrapper : public RandomAccessFile { + public: + explicit RandomAccessFileWrapper(RandomAccessFile* target) + : target_(target) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + return target_->MultiRead(reqs, num_reqs); + } + Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + RandomAccessFile* target_; +}; + +class WritableFileWrapper : public WritableFile { + public: + explicit WritableFileWrapper(WritableFile* t) : target_(t) {} + + Status Append(const Slice& data) override { return target_->Append(data); } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + return target_->PositionedAppend(data, offset); + } + Status Truncate(uint64_t size) override { return target_->Truncate(size); } + Status Close() override { return target_->Close(); } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetIOPriority(Env::IOPriority pri) override { + target_->SetIOPriority(pri); + } + + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { return target_->GetFileSize(); } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + return target_->RangeSync(offset, nbytes); + } + + void PrepareWrite(size_t offset, size_t len) override { + target_->PrepareWrite(offset, len); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + return target_->Allocate(offset, len); + } + + private: + WritableFile* target_; +}; + +class RandomRWFileWrapper : public RandomRWFile { + public: + explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + return target_->Write(offset, data); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + + private: + RandomRWFile* target_; +}; + +class DirectoryWrapper : public Directory { + public: + explicit DirectoryWrapper(Directory* target) : target_(target) {} + + Status Fsync() override { return target_->Fsync(); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + Directory* target_; +}; + +class LoggerWrapper : public Logger { + public: + explicit LoggerWrapper(Logger* target) : target_(target) {} + + Status Close() override { return target_->Close(); } + void LogHeader(const char* format, va_list ap) override { + return target_->LogHeader(format, ap); + } + void Logv(const char* format, va_list ap) override { + return target_->Logv(format, ap); + } + void Logv(const InfoLogLevel log_level, const char* format, + va_list ap) override { + return target_->Logv(log_level, format, ap); + } + size_t GetLogFileSize() const override { return target_->GetLogFileSize(); } + void Flush() override { return target_->Flush(); } + InfoLogLevel GetInfoLogLevel() const override { + return target_->GetInfoLogLevel(); + } + void SetInfoLogLevel(const InfoLogLevel log_level) override { + return target_->SetInfoLogLevel(log_level); + } + + private: + Logger* target_; +}; + +// Returns a new environment that stores its data in memory and delegates +// all non-file-storage tasks to base_env. The caller must delete the result +// when it is no longer needed. +// *base_env must remain live while the result is in use. +Env* NewMemEnv(Env* base_env); + +// Returns a new environment that is used for HDFS environment. +// This is a factory method for HdfsEnv declared in hdfs/env_hdfs.h +Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); + +// Returns a new environment that measures function call times for filesystem +// operations, reporting results to variables in PerfContext. +// This is a factory method for TimedEnv defined in utilities/env_timed.cc. +Env* NewTimedEnv(Env* base_env); + +// Returns an instance of logger that can be used for storing informational +// messages. +// This is a factory method for EnvLogger declared in logging/env_logging.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr<Logger>* result); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h new file mode 100644 index 000000000..a4db10fd0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/env_encryption.h @@ -0,0 +1,206 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include <string> + +#include "env.h" + +namespace ROCKSDB_NAMESPACE { + +class EncryptionProvider; + +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from disk. +Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); + +// BlockAccessCipherStream is the base class for any cipher stream that +// supports random access at block level (without requiring data from other +// blocks). E.g. CTR (Counter operation mode) supports this requirement. +class BlockAccessCipherStream { + public: + virtual ~BlockAccessCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; + + // Encrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize); + + // Decrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize); + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) = 0; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; +}; + +// BlockCipher +class BlockCipher { + public: + virtual ~BlockCipher(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) = 0; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) = 0; +}; + +// Implements a BlockCipher using ROT13. +// +// Note: This is a sample implementation of BlockCipher, +// it is NOT considered safe and should NOT be used in production. +class ROT13BlockCipher : public BlockCipher { + private: + size_t blockSize_; + + public: + ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} + virtual ~ROT13BlockCipher(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) override; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) override; +}; + +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. +// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation +// +// Note: This is a possible implementation of BlockAccessCipherStream, +// it is considered suitable for use. +class CTRCipherStream final : public BlockAccessCipherStream { + private: + BlockCipher& cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) + : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return cipher_.BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; +}; + +// The encryption provider is used to create a cipher stream for a specific +// file. The returned cipher stream will be used for actual +// encryption/decryption actions. +class EncryptionProvider { + public: + virtual ~EncryptionProvider(){}; + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() = 0; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) = 0; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) = 0; +}; + +// This encryption provider uses a CTR cipher stream, with a given block cipher +// and IV. +// +// Note: This is a possible implementation of EncryptionProvider, +// it is considered suitable for use, provided a safe BlockCipher is used. +class CTREncryptionProvider : public EncryptionProvider { + private: + BlockCipher& cipher_; + + protected: + const static size_t defaultPrefixLength = 4096; + + public: + CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; + virtual ~CTREncryptionProvider() {} + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) override; + + protected: + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize); + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h new file mode 100644 index 000000000..f26d6371c --- /dev/null +++ b/src/rocksdb/include/rocksdb/experimental.h @@ -0,0 +1,29 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +namespace experimental { + +// Supported only for Leveled compaction +Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); +Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end); + +// Move all L0 files to target_level skipping compaction. +// This operation succeeds only if the files in L0 have disjoint ranges; this +// is guaranteed to happen, for instance, if keys are inserted in sorted +// order. Furthermore, all levels between 1 and target_level must be empty. +// If any of the above condition is violated, InvalidArgument will be +// returned. +Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, + int target_level = 1); + +} // namespace experimental +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h new file mode 100644 index 000000000..35f54f40b --- /dev/null +++ b/src/rocksdb/include/rocksdb/file_checksum.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// FileChecksumFunc is the function class to generates the checksum value +// for each file when the file is written to the file system. +class FileChecksumFunc { + public: + virtual ~FileChecksumFunc() {} + // Return the checksum of concat (A, data[0,n-1]) where init_checksum is the + // returned value of some string A. It is used to maintain the checksum of a + // stream of data + virtual std::string Extend(const std::string& init_checksum, const char* data, + size_t n) = 0; + + // Return the checksum value of data[0,n-1] + virtual std::string Value(const char* data, size_t n) = 0; + + // Return a processed value of the checksum for store in somewhere + virtual std::string ProcessChecksum(const std::string& checksum) = 0; + + // Returns a name that identifies the current file checksum function. + virtual const char* Name() const = 0; +}; + +// FileChecksumList stores the checksum information of a list of files (e.g., +// SST files). The FileChecksumLIst can be used to store the checksum +// information of all SST file getting from the MANIFEST, which are +// the checksum information of all valid SST file of a DB instance. It can +// also be used to store the checksum information of a list of SST files to +// be ingested. +class FileChecksumList { + public: + virtual ~FileChecksumList() {} + + // Clean the previously stored file checksum information. + virtual void reset() = 0; + + // Get the number of checksums in the checksum list + virtual size_t size() const = 0; + + // Return all the file checksum information being stored in a unordered_map. + // File_number is the key, the first part of the value is checksum value, + // and the second part of the value is checksum function name. + virtual Status GetAllFileChecksums( + std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums, + std::vector<std::string>* checksum_func_names) = 0; + + // Given the file_number, it searches if the file checksum information is + // stored. + virtual Status SearchOneFileChecksum(uint64_t file_number, + std::string* checksum, + std::string* checksum_func_name) = 0; + + // Insert the checksum information of one file to the FileChecksumList. + virtual Status InsertOneFileChecksum( + uint64_t file_number, const std::string& checksum, + const std::string& checksum_func_name) = 0; + + // Remove the checksum information of one SST file. + virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0; +}; + +// Create a new file checksum list. +extern FileChecksumList* NewFileChecksumList(); + +// Create a Crc32c based file checksum function +extern FileChecksumFunc* CreateFileChecksumFuncCrc32c(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h new file mode 100644 index 000000000..c1fd919f3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/file_system.h @@ -0,0 +1,1358 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A FileSystem is an interface used by the rocksdb implementation to access +// storage functionality like the filesystem etc. Callers +// may wish to provide a custom FileSystem object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All FileSystem implementations are safe for concurrent access from +// multiple threads without any external synchronization. +// +// WARNING: Since this is a new interface, it is expected that there will be +// some changes as storage systems are ported over. + +#pragma once + +#include <stdint.h> +#include <chrono> +#include <cstdarg> +#include <functional> +#include <limits> +#include <memory> +#include <sstream> +#include <string> +#include <vector> +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/options.h" +#include "rocksdb/thread_status.h" + +namespace ROCKSDB_NAMESPACE { + +class FileLock; +class FSDirectory; +class FSRandomAccessFile; +class FSRandomRWFile; +class FSSequentialFile; +class FSWritableFile; +class Logger; +class Slice; +struct ImmutableDBOptions; +struct MutableDBOptions; +class RateLimiter; + +using AccessPattern = RandomAccessFile::AccessPattern; +using FileAttributes = Env::FileAttributes; + +// Priority of an IO request. This is a hint and does not guarantee any +// particular QoS. +// IO_LOW - Typically background reads/writes such as compaction/flush +// IO_HIGH - Typically user reads/synchronous WAL writes +enum class IOPriority : uint8_t { + kIOLow, + kIOHigh, + kIOTotal, +}; + +// Type of the data begin read/written. It can be passed down as a flag +// for the FileSystem implementation to optionally handle different types in +// different ways +enum class IOType : uint8_t { + kData, + kFilter, + kIndex, + kMetadata, + kWAL, + kManifest, + kLog, + kUnknown, + kInvalid, +}; + +// Per-request options that can be passed down to the FileSystem +// implementation. These are hints and are not necessarily guaranteed to be +// honored. More hints can be added here in the future to indicate things like +// storage media (HDD/SSD) to be used, replication level etc. +struct IOOptions { + // Timeout for the operation in milliseconds + std::chrono::milliseconds timeout; + + // Priority - high or low + IOPriority prio; + + // Type of data being read/written + IOType type; +}; + +// File scope options that control how a file is opened/created and accessed +// while its open. We may add more options here in the future such as +// redundancy level, media to use etc. +struct FileOptions : EnvOptions { + // Embedded IOOptions to control the parameters for any IOs that need + // to be issued for the file open/creation + IOOptions io_options; + + FileOptions() : EnvOptions() {} + + FileOptions(const DBOptions& opts) + : EnvOptions(opts) {} + + FileOptions(const EnvOptions& opts) + : EnvOptions(opts) {} +}; + +// A structure to pass back some debugging information from the FileSystem +// implementation to RocksDB in case of an IO error +struct IODebugContext { + // file_path to be filled in by RocksDB in case of an error + std::string file_path; + + // A map of counter names to values - set by the FileSystem implementation + std::map<std::string, uint64_t> counters; + + // To be set by the FileSystem implementation + std::string msg; + + IODebugContext() {} + + void AddCounter(std::string& name, uint64_t value) { + counters.emplace(name, value); + } + + std::string ToString() { + std::ostringstream ss; + ss << file_path << ", "; + for (auto counter : counters) { + ss << counter.first << " = " << counter.second << ","; + } + ss << msg; + return ss.str(); + } +}; + +// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile, +// FSRandomRWFileclass, and FSDIrectory classes define the interface between +// RocksDB and storage systems, such as Posix filesystems, +// remote filesystems etc. +// The interface allows for fine grained control of individual IO operations, +// such as setting a timeout, prioritization, hints on data placement, +// different handling based on type of IO etc. +// This is accomplished by passing an instance of IOOptions to every +// API call that can potentially perform IO. Additionally, each such API is +// passed a pointer to a IODebugContext structure that can be used by the +// storage system to include troubleshooting information. The return values +// of the APIs is of type IOStatus, which can indicate an error code/sub-code, +// as well as metadata about the error such as its scope and whether its +// retryable. +class FileSystem { + public: + FileSystem(); + + // No copying allowed + FileSystem(const FileSystem&) = delete; + + virtual ~FileSystem(); + + virtual const char* Name() const = 0; + + static const char* Type() { return "FileSystem"; } + + // Loads the FileSystem specified by the input value into the result + static Status Load(const std::string& value, + std::shared_ptr<FileSystem>* result); + + // Return a default fie_system suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default file_system + // + // The result of Default() belongs to rocksdb and must never be deleted. + static std::shared_ptr<FileSystem> Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* dbg) = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual IOStatus NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* dbg) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + kWLTHNotSet = 0, // No hint information set + kWLTHNone, // No hints about write life time + kWLTHShort, // Data written has a short life time + kWLTHMedium, // Data written has a medium life time + kWLTHLong, // Data written has a long life time + kWLTHExtreme, // Data written has an extremely long life time + }; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus ReopenWritableFile( + const std::string& /*fname*/, const FileOptions& /*options*/, + std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) { + return IOStatus::NotSupported(); + } + + // Reuse an existing file by renaming it and opening it as writable. + virtual IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) = 0; + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr<FSRandomRWFile>* /*result*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "RandomRWFile is not implemented in this FileSystem"); + } + + // Opens `fname` as a memory-mapped file for read and write (in-place updates + // only, i.e., no appends). On success, stores a raw buffer covering the whole + // file in `*result`. The file must exist prior to this call. + virtual IOStatus NewMemoryMappedFileBuffer( + const std::string& /*fname*/, + std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) { + return IOStatus::NotSupported( + "MemoryMappedFileBuffer is not implemented in this FileSystem"); + } + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + virtual IOStatus NewDirectory(const std::string& name, + const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) = 0; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + virtual IOStatus FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector<std::string>* result, + IODebugContext* dbg) = 0; + + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual IOStatus GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector<FileAttributes>* result, IODebugContext* dbg) { + assert(result != nullptr); + std::vector<std::string> child_fnames; + IOStatus s = GetChildren(dir, options, &child_fnames, dbg); + if (!s.ok()) { + return s; + } + result->resize(child_fnames.size()); + size_t result_size = 0; + for (size_t i = 0; i < child_fnames.size(); ++i) { + const std::string path = dir + "/" + child_fnames[i]; + if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes, + dbg)) + .ok()) { + if (FileExists(path, options, dbg).IsNotFound()) { + // The file may have been deleted since we listed the directory + continue; + } + return s; + } + (*result)[result_size].name = std::move(child_fnames[i]); + result_size++; + } + result->resize(result_size); + return IOStatus::OK(); + } + + // Delete the named file. + virtual IOStatus DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Truncate the named file to the specified size. + virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("Truncate is not supported for this FileSystem"); + } + + // Create the specified directory. Returns error if directory exists. + virtual IOStatus CreateDir(const std::string& dirname, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Delete the specified directory. + virtual IOStatus DeleteDir(const std::string& dirname, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Store the size of fname in *file_size. + virtual IOStatus GetFileSize(const std::string& fname, + const IOOptions& options, uint64_t* file_size, + IODebugContext* dbg) = 0; + + // Store the last modification time of fname in *file_mtime. + virtual IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) = 0; + // Rename file src to target. + virtual IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, + IODebugContext* dbg) = 0; + + // Hard Link file src to target. + virtual IOStatus LinkFile(const std::string& /*src*/, + const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("LinkFile is not supported for this FileSystem"); + } + + virtual IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*count*/, IODebugContext* /*dbg*/) { + return IOStatus::NotSupported( + "Getting number of file links is not supported for this FileSystem"); + } + + virtual IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("AreFilesSame is not supported for this FileSystem"); + } + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) = 0; + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) = 0; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. + virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) = 0; + + // Get full directory name for this db. + virtual IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) = 0; + + // OptimizeForLogRead will create a new FileOptions object that is a copy of + // the FileOptions in the parameters, but is optimized for reading log files. + virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const; + + // OptimizeForManifestRead will create a new FileOptions object that is a copy + // of the FileOptions in the parameters, but is optimized for reading manifest + // files. + virtual FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const; + + // OptimizeForLogWrite will create a new FileOptions object that is a copy of + // the FileOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const; + + // OptimizeForManifestWrite will create a new FileOptions object that is a + // copy of the FileOptions in the parameters, but is optimized for writing + // manifest files. Default implementation returns the copy of the same + // object. + virtual FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const; + + // OptimizeForCompactionTableWrite will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // writing table files. + virtual FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const; + + // OptimizeForCompactionTableRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading table files. + virtual FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + +// This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + + // Get the amount of free disk space + virtual IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, + uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported(); + } + + // If you're adding methods here, remember to add them to EnvWrapper too. + + private: + void operator=(const FileSystem&); +}; + +// A file abstraction for reading sequentially through a file +class FSSequentialFile { + public: + FSSequentialFile() {} + + virtual ~FSSequentialFile() {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + virtual IOStatus Skip(uint64_t n) = 0; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + Slice* /*result*/, char* /*scratch*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported(); + } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. +}; + +// A read IO request structure for use in MultiRead +struct FSReadRequest { + // File offset in bytes + uint64_t offset; + + // Length to read in bytes + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + Slice result; + + // Status of read + IOStatus status; +}; + +// A file abstraction for randomly reading the contents of a file. +class FSRandomAccessFile { + public: + FSRandomAccessFile() {} + + virtual ~FSRandomAccessFile() {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const = 0; + + // Readahead the file starting from offset by n bytes for caching. + virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); + } + + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping. If the function return Status is not ok, status of + // individual requests will be ignored and return status will be assumed + // for all read requests. The function return status is only meant for any + // any errors that occur before even processing specific read requests + virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + FSReadRequest& req = reqs[i]; + req.status = + Read(req.offset, req.len, options, &req.result, req.scratch, dbg); + } + return IOStatus::OK(); + } + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + }; + + enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed }; + + virtual void Hint(AccessPattern /*pattern*/) {} + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // If you're adding methods here, remember to add them to + // RandomAccessFileWrapper too. +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class FSWritableFile { + public: + FSWritableFile() + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(false) {} + + explicit FSWritableFile(const FileOptions& options) + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET), + strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + + virtual ~FSWritableFile() {} + + // Append data to the end of the file + // Note: A WriteabelFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + virtual IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) = 0; + + // PositionedAppend data to the specified offset. The new EOF after append + // must be larger than the previous EOF. This is to be used when writes are + // not backed by OS buffers and hence has to always start from the start of + // the sector. The implementation thus needs to also rewrite the last + // partial sector. + // Note: PositionAppend does not guarantee moving the file offset after the + // write. A WritableFile object must support either Append or + // PositionedAppend, so the users cannot mix the two. + // + // PositionedAppend() can only happen on the page/sector boundaries. For that + // reason, if the last write was an incomplete sector we still need to rewind + // back to the nearest sector/page and rewrite the portion of it with whatever + // we need to add. We need to keep where we stop writing. + // + // PositionedAppend() can only write whole sectors. For that reason we have to + // pad with zeros for the last write and trim the file when closing according + // to the position we keep in the previous step. + // + // PositionedAppend() requires aligned buffer to be passed in. The alignment + // required is queried via GetRequiredBufferAlignment() + virtual IOStatus PositionedAppend(const Slice& /* data */, + uint64_t /* offset */, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported(); + } + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); + } + virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0; + virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0; + virtual IOStatus Sync(const IOOptions& options, + IODebugContext* dbg) = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) { + return Sync(options, dbg); + } + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + virtual bool IsSyncThreadSafe() const { return false; } + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; } + + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return 0; + } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + virtual void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) { + return IOStatus::NotSupported("InvalidateCache not supported."); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, + const IOOptions& options, IODebugContext* dbg) { + if (strict_bytes_per_sync_) { + return Sync(options, dbg); + } + return IOStatus::OK(); + } + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks, options, dbg); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + // Pre-allocates space for a file. + virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); + } + + // If you're adding methods here, remember to add them to + // WritableFileWrapper too. + + protected: + size_t preallocation_block_size() { return preallocation_block_size_; } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + // No copying allowed + FSWritableFile(const FSWritableFile&); + void operator=(const FSWritableFile&); + + protected: + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + const bool strict_bytes_per_sync_; +}; + +// A file abstraction for random reading and writing. +class FSRandomRWFile { + public: + FSRandomRWFile() {} + + virtual ~FSRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const { return false; } + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& options, IODebugContext* dbg) = 0; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const = 0; + + virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0; + + virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0; + + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) { + return Sync(options, dbg); + } + + virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0; + + // If you're adding methods here, remember to add them to + // RandomRWFileWrapper too. + + // No copying allowed + FSRandomRWFile(const RandomRWFile&) = delete; + FSRandomRWFile& operator=(const RandomRWFile&) = delete; +}; + +// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. +// Subclasses should release the mapping upon destruction. +class FSMemoryMappedFileBuffer { + public: + FSMemoryMappedFileBuffer(void* _base, size_t _length) + : base_(_base), length_(_length) {} + + virtual ~FSMemoryMappedFileBuffer() = 0; + + // We do not want to unmap this twice. We can make this class + // movable if desired, however, since + FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete; + FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete; + + void* GetBase() const { return base_; } + size_t GetLen() const { return length_; } + + protected: + void* base_; + const size_t length_; +}; + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class FSDirectory { + public: + virtual ~FSDirectory() {} + // Fsync directory. Can be called concurrently from multiple threads. + virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; + + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; + } + + // If you're adding methods here, remember to add them to + // DirectoryWrapper too. +}; + +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public +// ROCKSDB_NAMESPACE::FSSequentialFileWrapper { +// public: +// MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target): +// ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {} +// Status Read(size_t n, FileSystem::IOOptions& options, Slice* result, +// char* scratch, FileSystem::IODebugContext* dbg) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options, +// result, +// scratch, dbg); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class FileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit FileSystemWrapper(FileSystem* t) : target_(t) {} + ~FileSystemWrapper() override {} + + // Return the target to which this Env forwards all calls + FileSystem* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* r, + IODebugContext* dbg) override { + return target_->NewSequentialFile(f, file_opts, r, dbg); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* r, + IODebugContext* dbg) override { + return target_->NewRandomAccessFile(f, file_opts, r, dbg); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* r, + IODebugContext* dbg) override { + return target_->NewWritableFile(f, file_opts, r, dbg); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override { + return target_->ReopenWritableFile(fname, file_opts, result, dbg); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* r, + IODebugContext* dbg) override { + return target_->ReuseWritableFile(fname, old_fname, file_opts, r, + dbg); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override { + return target_->NewRandomRWFile(fname, file_opts, result, dbg); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return target_->NewMemoryMappedFileBuffer(fname, result); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) override { + return target_->NewDirectory(name, io_opts, result, dbg); + } + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override { + return target_->FileExists(f, io_opts, dbg); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector<std::string>* r, + IODebugContext* dbg) override { + return target_->GetChildren(dir, io_opts, r, dbg); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector<FileAttributes>* result, + IODebugContext* dbg) override { + return target_->GetChildrenFileAttributes(dir, options, result, dbg); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteFile(f, options, dbg); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override { + return target_->Truncate(fname, size, options, dbg); + } + IOStatus CreateDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDir(d, options, dbg); + } + IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->CreateDirIfMissing(d, options, dbg); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& options, + IODebugContext* dbg) override { + return target_->DeleteDir(d, options, dbg); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& options, + uint64_t* s, IODebugContext* dbg) override { + return target_->GetFileSize(f, options, s, dbg); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override { + return target_->GetFileModificationTime(fname, options, file_mtime, dbg); + } + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override { + return target_->GetAbsolutePath(db_path, options, output_path, dbg); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->RenameFile(s, t, options, dbg); + } + + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { + return target_->LinkFile(s, t, options, dbg); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& options, + uint64_t* count, IODebugContext* dbg) override { + return target_->NumFileLinks(fname, options, count, dbg); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& options, bool* res, + IODebugContext* dbg) override { + return target_->AreFilesSame(first, second, options, res, dbg); + } + + IOStatus LockFile(const std::string& f, const IOOptions& options, + FileLock** l, IODebugContext* dbg) override { + return target_->LockFile(f, options, l, dbg); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& options, + IODebugContext* dbg) override { + return target_->UnlockFile(l, options, dbg); + } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override { + return target_->GetTestDirectory(options, path, dbg); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) override { + return target_->NewLogger(fname, options, result, dbg); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, + uint64_t* diskfree, IODebugContext* dbg) override { + return target_->GetFreeSpace(path, options, diskfree, dbg); + } + + private: + FileSystem* target_; +}; + +class FSSequentialFileWrapper : public FSSequentialFile { + public: + explicit FSSequentialFileWrapper(FSSequentialFile* target) + : target_(target) {} + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + return target_->Read(n, options, result, scratch, dbg); + } + IOStatus Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override { + return target_->PositionedRead(offset, n, options, result, scratch, dbg); + } + + private: + FSSequentialFile* target_; +}; + +class FSRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit FSRandomAccessFileWrapper(FSRandomAccessFile* target) + : target_(target) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return target_->Read(offset, n, options, result, scratch, dbg); + } + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override { + return target_->MultiRead(reqs, num_reqs, options, dbg); + } + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Prefetch(offset, n, options, dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + }; + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + FSRandomAccessFile* target_; +}; + +class FSWritableFileWrapper : public FSWritableFile { + public: + explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Append(data, options, dbg); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, dbg); + } + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Truncate(size, options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override { + return target_->GetFileSize(options, dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override { + return target_->RangeSync(offset, nbytes, options, dbg); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override { + target_->PrepareWrite(offset, len, options, dbg); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Allocate(offset, len, options, dbg); + } + + private: + FSWritableFile* target_; +}; + +class FSRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit FSRandomRWFileWrapper(FSRandomRWFile* target) : target_(target) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return target_->Write(offset, data, options, dbg); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return target_->Read(offset, n, options, result, scratch, dbg); + } + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override { + return target_->Flush(options, dbg); + } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Sync(options, dbg); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return target_->Close(options, dbg); + } + + private: + FSRandomRWFile* target_; +}; + +class FSDirectoryWrapper : public FSDirectory { + public: + explicit FSDirectoryWrapper(FSDirectory* target) : target_(target) {} + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return target_->Fsync(options, dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + FSDirectory* target_; +}; + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h new file mode 100644 index 000000000..03d6471cf --- /dev/null +++ b/src/rocksdb/include/rocksdb/filter_policy.h @@ -0,0 +1,200 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in rocksdb and are consulted +// automatically by rocksdb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks form a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#pragma once + +#include <stdlib.h> +#include <memory> +#include <stdexcept> +#include <string> +#include <vector> + +#include "rocksdb/advanced_options.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +struct BlockBasedTableOptions; + +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add Key to filter, you could use any way to store the key. + // Such as: storing hashes or original keys + // Keys are in sorted order and duplicated keys are possible. + virtual void AddKey(const Slice& key) = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0; + + // Calculate num of keys that can be added and generate a filter + // <= the specified number of bytes. +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4702) // unreachable code +#endif + virtual int CalculateNumEntry(const uint32_t /*bytes*/) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("CalculateNumEntry not Implemented"); +#else + abort(); +#endif + return 0; + } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif +}; + +// A class that checks if a key can be in filter +// It should be initialized by Slice generated by BitsBuilder +class FilterBitsReader { + public: + virtual ~FilterBitsReader() {} + + // Check if the entry match the bits in filter + virtual bool MayMatch(const Slice& entry) = 0; + + // Check if an array of entries match the bits in filter + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) { + for (int i = 0; i < num_keys; ++i) { + may_match[i] = MayMatch(*keys[i]); + } + } +}; + +// Contextual information passed to BloomFilterPolicy at filter building time. +// Used in overriding FilterPolicy::GetBuilderWithContext(). References other +// structs because this is expected to be a temporary, stack-allocated object. +struct FilterBuildingContext { + // This constructor is for internal use only and subject to change. + FilterBuildingContext(const BlockBasedTableOptions& table_options); + + // Options for the table being built + const BlockBasedTableOptions& table_options; + + // Name of the column family for the table (or empty string if unknown) + std::string column_family_name; + + // The compactions style in effect for the table + CompactionStyle compaction_style = kCompactionStyleLevel; + + // The table level at time of constructing the SST file, or -1 if unknown. + // (The table file could later be used at a different level.) + int level_at_creation = -1; + + // An optional logger for reporting errors, warnings, etc. + Logger* info_log = nullptr; +}; + +// We add a new format of filter block called full filter block +// This new interface gives you more space of customization +// +// For the full filter block, you can plug in your version by implement +// the FilterBitsBuilder and FilterBitsReader +// +// There are two sets of interface in FilterPolicy +// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter +// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for +// full filter. +// Set 1 MUST be implemented correctly, Set 2 is optional +// RocksDB would first try using functions in Set 2. if they return nullptr, +// it would use Set 1 instead. +// You can choose filter type in NewBloomFilterPolicy +class FilterPolicy { + public: + virtual ~FilterPolicy(); + + // Return the name of this policy. Note that if the filter encoding + // changes in an incompatible way, the name returned by this method + // must be changed. Otherwise, old incompatible filters may be + // passed to methods of this type. + virtual const char* Name() const = 0; + + // keys[0,n-1] contains a list of keys (potentially with duplicates) + // that are ordered according to the user supplied comparator. + // Append a filter that summarizes keys[0,n-1] to *dst. + // + // Warning: do not change the initial contents of *dst. Instead, + // append the newly constructed filter to *dst. + virtual void CreateFilter(const Slice* keys, int n, + std::string* dst) const = 0; + + // "filter" contains the data appended by a preceding call to + // CreateFilter() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; + + // Return a new FilterBitsBuilder for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // NOTE: This function is only called by GetBuilderWithContext() below for + // custom FilterPolicy implementations. Thus, it is not necessary to + // override this function if overriding GetBuilderWithContext(). + virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } + + // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy + // to customize the builder for contextual constraints and hints. + // (Name changed to avoid triggering -Werror=overloaded-virtual.) + // If overriding GetFilterBitsBuilder() suffices, it is not necessary to + // override this function. + virtual FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const { + return GetFilterBitsBuilder(); + } + + // Return a new FilterBitsReader for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // As here, the input slice should NOT be deleted by FilterPolicy. + virtual FilterBitsReader* GetFilterBitsReader( + const Slice& /*contents*/) const { + return nullptr; + } +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. +// +// bits_per_key: average bits allocated per key in bloom filter. A good +// choice is 9.9, which yields a filter with ~ 1% false positive rate. +// When format_version < 5, the value will be rounded to the nearest +// integer. Recommend using no more than three decimal digits after the +// decimal point, as in 6.667. +// +// use_block_based_builder: use deprecated block based filter (true) rather +// than full or partitioned filter (false). +// +// Callers must delete the result after any database that is using the +// result has been closed. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. +extern const FilterPolicy* NewBloomFilterPolicy( + double bits_per_key, bool use_block_based_builder = false); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h new file mode 100644 index 000000000..badc0808a --- /dev/null +++ b/src/rocksdb/include/rocksdb/flush_block_policy.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class BlockBuilder; +struct Options; + +// FlushBlockPolicy provides a configurable way to determine when to flush a +// block in the block based tables, +class FlushBlockPolicy { + public: + // Keep track of the key/value sequences and return the boolean value to + // determine if table builder should flush current data block. + virtual bool Update(const Slice& key, const Slice& value) = 0; + + virtual ~FlushBlockPolicy() {} +}; + +class FlushBlockPolicyFactory { + public: + // Return the name of the flush block policy. + virtual const char* Name() const = 0; + + // Return a new block flush policy that flushes data blocks by data size. + // FlushBlockPolicy may need to access the metadata of the data block + // builder to determine when to flush the blocks. + // + // Callers must delete the result after any database that is using the + // result has been closed. + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const = 0; + + virtual ~FlushBlockPolicyFactory() {} +}; + +class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { + public: + FlushBlockBySizePolicyFactory() {} + + const char* Name() const override { return "FlushBlockBySizePolicyFactory"; } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const override; + + static FlushBlockPolicy* NewFlushBlockPolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h new file mode 100644 index 000000000..a1de859ad --- /dev/null +++ b/src/rocksdb/include/rocksdb/io_status.h @@ -0,0 +1,232 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// An IOStatus encapsulates the result of an operation. It may indicate +// success, or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on an IOStatus without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same IOStatus must use +// external synchronization. + +#pragma once + +#include <string> +#include "rocksdb/slice.h" +#ifdef OS_WIN +#include <string.h> +#endif +#include <cstring> +#include "status.h" + +namespace ROCKSDB_NAMESPACE { + +class IOStatus : public Status { + public: + using Code = Status::Code; + using SubCode = Status::SubCode; + + enum IOErrorScope { + kIOErrorScopeFileSystem, + kIOErrorScopeFile, + kIOErrorScopeRange, + kIOErrorScopeMax, + }; + + // Create a success status. + IOStatus() : IOStatus(kOk, kNone) {} + ~IOStatus() {} + + // Copy the specified status. + IOStatus(const IOStatus& s); + IOStatus& operator=(const IOStatus& s); + IOStatus(IOStatus&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; + IOStatus& operator=(IOStatus&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; + bool operator==(const IOStatus& rhs) const; + bool operator!=(const IOStatus& rhs) const; + + void SetRetryable(bool retryable) { retryable_ = retryable; } + void SetDataLoss(bool data_loss) { data_loss_ = data_loss; } + void SetScope(IOErrorScope scope) { scope_ = scope; } + + bool GetRetryable() const { return retryable_; } + bool GetDataLoss() const { return data_loss_; } + IOErrorScope GetScope() const { return scope_; } + + // Return a success status. + static IOStatus OK() { return IOStatus(); } + + static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotSupported, msg, msg2); + } + static IOStatus NotSupported(SubCode msg = kNone) { + return IOStatus(kNotSupported, msg); + } + + // Return error status of an appropriate type. + static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static IOStatus NotFound(SubCode msg = kNone) { + return IOStatus(kNotFound, msg); + } + + static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kCorruption, msg, msg2); + } + static IOStatus Corruption(SubCode msg = kNone) { + return IOStatus(kCorruption, msg); + } + + static IOStatus InvalidArgument(const Slice& msg, + const Slice& msg2 = Slice()) { + return IOStatus(kInvalidArgument, msg, msg2); + } + static IOStatus InvalidArgument(SubCode msg = kNone) { + return IOStatus(kInvalidArgument, msg); + } + + static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, msg, msg2); + } + static IOStatus IOError(SubCode msg = kNone) { + return IOStatus(kIOError, msg); + } + + static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); } + static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kBusy, msg, msg2); + } + + static IOStatus TimedOut(SubCode msg = kNone) { + return IOStatus(kTimedOut, msg); + } + static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kTimedOut, msg, msg2); + } + + static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); } + static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kNoSpace, msg, msg2); + } + + static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); } + static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kPathNotFound, msg, msg2); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + // std::string ToString() const; + + private: + friend IOStatus status_to_io_status(Status&&); + bool retryable_; + bool data_loss_; + IOErrorScope scope_; + + explicit IOStatus(Code _code, SubCode _subcode = kNone) + : Status(_code, _subcode), + retryable_(false), + data_loss_(false), + scope_(kIOErrorScopeFileSystem) {} + + IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + IOStatus(Code _code, const Slice& msg, const Slice& msg2) + : IOStatus(_code, kNone, msg, msg2) {} +}; + +inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2) + : Status(_code, _subcode), + retryable_(false), + data_loss_(false), + scope_(kIOErrorScopeFileSystem) { + assert(code_ != kOk); + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_ = result; +} + +inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) { + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline IOStatus& IOStatus::operator=(const IOStatus& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (this != &s) { + code_ = s.code_; + subcode_ = s.subcode_; + retryable_ = s.retryable_; + data_loss_ = s.data_loss_; + scope_ = s.scope_; + delete[] state_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); + } + return *this; +} + +inline IOStatus::IOStatus(IOStatus&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + : IOStatus() { + *this = std::move(s); +} + +inline IOStatus& IOStatus::operator=(IOStatus&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif +{ + if (this != &s) { + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + retryable_ = s.retryable_; + retryable_ = false; + data_loss_ = s.data_loss_; + data_loss_ = false; + scope_ = s.scope_; + scope_ = kIOErrorScopeFileSystem; + delete[] state_; + state_ = nullptr; + std::swap(state_, s.state_); + } + return *this; +} + +inline bool IOStatus::operator==(const IOStatus& rhs) const { + return (code_ == rhs.code_); +} + +inline bool IOStatus::operator!=(const IOStatus& rhs) const { + return !(*this == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h new file mode 100644 index 000000000..b31b6d70a --- /dev/null +++ b/src/rocksdb/include/rocksdb/iostats_context.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <stdint.h> +#include <string> + +#include "rocksdb/perf_level.h" + +// A thread local context for gathering io-stats efficiently and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +namespace ROCKSDB_NAMESPACE { + +struct IOStatsContext { + // reset all io-stats counter to zero + void Reset(); + + std::string ToString(bool exclude_zero_counters = false) const; + + // the thread pool id + uint64_t thread_pool_id; + + // number of bytes that has been written. + uint64_t bytes_written; + // number of bytes that has been read. + uint64_t bytes_read; + + // time spent in open() and fopen(). + uint64_t open_nanos; + // time spent in fallocate(). + uint64_t allocate_nanos; + // time spent in write() and pwrite(). + uint64_t write_nanos; + // time spent in read() and pread() + uint64_t read_nanos; + // time spent in sync_file_range(). + uint64_t range_sync_nanos; + // time spent in fsync + uint64_t fsync_nanos; + // time spent in preparing write (fallocate etc). + uint64_t prepare_write_nanos; + // time spent in Logger::Logv(). + uint64_t logger_nanos; + // CPU time spent in write() and pwrite() + uint64_t cpu_write_nanos; + // CPU time spent in read() and pread() + uint64_t cpu_read_nanos; +}; + +// Get Thread-local IOStatsContext object pointer +IOStatsContext* get_iostats_context(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h new file mode 100644 index 000000000..2f8f1e385 --- /dev/null +++ b/src/rocksdb/include/rocksdb/iterator.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. + +#pragma once + +#include <string> +#include "rocksdb/cleanable.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator : public Cleanable { + public: + Iterator() {} + // No copying allowed + Iterator(const Iterator&) = delete; + void operator=(const Iterator&) = delete; + + virtual ~Iterator() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. + virtual void Seek(const Slice& target) = 0; + + // Position at the last key in the source that at or before target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + virtual void SeekForPrev(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // If supported, renew the iterator to represent the latest state. The + // iterator will be invalidated after the call. Not supported if + // ReadOptions.snapshot is given when creating the iterator. + virtual Status Refresh() { + return Status::NotSupported("Refresh() is not supported"); + } + + // Property "rocksdb.iterator.is-key-pinned": + // If returning "1", this means that the Slice returned by key() is valid + // as long as the iterator is not deleted. + // It is guaranteed to always return "1" if + // - Iterator created with ReadOptions::pin_data = true + // - DB tables were created with + // BlockBasedTableOptions::use_delta_encoding = false. + // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + // Property "rocksdb.iterator.internal-key": + // Get the user-key portion of the internal key at which the iteration + // stopped. + virtual Status GetProperty(std::string prop_name, std::string* prop); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h new file mode 100644 index 000000000..22ea7734f --- /dev/null +++ b/src/rocksdb/include/rocksdb/ldb_tool.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE +#include <string> +#include <vector> +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// An interface for converting a slice to a readable string +class SliceFormatter { + public: + virtual ~SliceFormatter() {} + virtual std::string Format(const Slice& s) const = 0; +}; + +// Options for customizing ldb tool (beyond the DB Options) +struct LDBOptions { + // Create LDBOptions with default values for all fields + LDBOptions(); + + // Key formatter that converts a slice to a readable string. + // Default: Slice::ToString() + std::shared_ptr<SliceFormatter> key_formatter; + + std::string print_help_header = "ldb - RocksDB Tool"; +}; + +class LDBTool { + public: + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector<ColumnFamilyDescriptor>* column_families = nullptr); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h new file mode 100644 index 000000000..d1c953f0f --- /dev/null +++ b/src/rocksdb/include/rocksdb/listener.h @@ -0,0 +1,491 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#pragma once + +#include <chrono> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>> + TablePropertiesCollection; + +class DB; +class ColumnFamilyHandle; +class Status; +struct CompactionJobStats; +enum CompressionType : unsigned char; + +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +struct TableFileCreationBriefInfo { + // the name of the database where the file was created + std::string db_name; + // the name of the column family where the file was created. + std::string cf_name; + // the path to the created file. + std::string file_path; + // the id of the job (which could be flush or compaction) that + // created the file. + int job_id; + // reason of creating the table. + TableFileCreationReason reason; +}; + +struct TableFileCreationInfo : public TableFileCreationBriefInfo { + TableFileCreationInfo() = default; + explicit TableFileCreationInfo(TableProperties&& prop) + : table_properties(prop) {} + // the size of the file. + uint64_t file_size; + // Detailed properties of the created file. + TableProperties table_properties; + // The status indicating whether the creation was successful or not. + Status status; +}; + +enum class CompactionReason : int { + kUnknown = 0, + // [Level] number of L0 files > level0_file_num_compaction_trigger + kLevelL0FilesNum, + // [Level] total size of level > MaxBytesForLevel() + kLevelMaxLevelSize, + // [Universal] Compacting for size amplification + kUniversalSizeAmplification, + // [Universal] Compacting for size ratio + kUniversalSizeRatio, + // [Universal] number of sorted runs > level0_file_num_compaction_trigger + kUniversalSortedRunNum, + // [FIFO] total size > max_table_files_size + kFIFOMaxSize, + // [FIFO] reduce number of files. + kFIFOReduceNumFiles, + // [FIFO] files with creation time < (current_time - interval) + kFIFOTtl, + // Manual compaction + kManualCompaction, + // DB::SuggestCompactRange() marked files for compaction + kFilesMarkedForCompaction, + // [Level] Automatic compaction within bottommost level to cleanup duplicate + // versions of same user key, usually due to a released snapshot. + kBottommostFiles, + // Compaction based on TTL + kTtl, + // According to the comments in flush_job.cc, RocksDB treats flush as + // a level 0 compaction in internal stats. + kFlush, + // Compaction caused by external sst file ingestion + kExternalSstIngestion, + // Compaction due to SST file being too old + kPeriodicCompaction, + // total number of compaction reasons, new reasons must be added above this. + kNumOfReasons, +}; + +enum class FlushReason : int { + kOthers = 0x00, + kGetLiveFiles = 0x01, + kShutDown = 0x02, + kExternalFileIngestion = 0x03, + kManualCompaction = 0x04, + kWriteBufferManager = 0x05, + kWriteBufferFull = 0x06, + kTest = 0x07, + kDeleteFiles = 0x08, + kAutoCompaction = 0x09, + kManualFlush = 0x0a, + kErrorRecovery = 0xb, +}; + +enum class BackgroundErrorReason { + kFlush, + kCompaction, + kWriteCallback, + kMemTable, +}; + +enum class WriteStallCondition { + kNormal, + kDelayed, + kStopped, +}; + +struct WriteStallInfo { + // the name of the column family + std::string cf_name; + // state of the write controller + struct { + WriteStallCondition cur; + WriteStallCondition prev; + } condition; +}; + +#ifndef ROCKSDB_LITE + +struct TableFileDeletionInfo { + // The name of the database where the file was deleted. + std::string db_name; + // The path to the deleted file. + std::string file_path; + // The id of the job which deleted the file. + int job_id; + // The status indicating whether the deletion was successful or not. + Status status; +}; + +struct FileOperationInfo { + using TimePoint = std::chrono::time_point<std::chrono::system_clock, + std::chrono::nanoseconds>; + + const std::string& path; + uint64_t offset; + size_t length; + const TimePoint& start_timestamp; + const TimePoint& finish_timestamp; + Status status; + FileOperationInfo(const std::string& _path, const TimePoint& start, + const TimePoint& finish) + : path(_path), start_timestamp(start), finish_timestamp(finish) {} +}; + +struct FlushJobInfo { + // the id of the column family + uint32_t cf_id; + // the name of the column family + std::string cf_name; + // the path to the newly created file + std::string file_path; + // the file number of the newly created file + uint64_t file_number; + // the oldest blob file referenced by the newly created file + uint64_t oldest_blob_file_number; + // the id of the thread that completed this flush job. + uint64_t thread_id; + // the job id, which is unique in the same thread. + int job_id; + // If true, then rocksdb is currently slowing-down all writes to prevent + // creating too many Level 0 files as compaction seems not able to + // catch up the write request speed. This indicates that there are + // too many files in Level 0. + bool triggered_writes_slowdown; + // If true, then rocksdb is currently blocking any writes to prevent + // creating more L0 files. This indicates that there are too many + // files in level 0. Compactions should try to compact L0 files down + // to lower levels as soon as possible. + bool triggered_writes_stop; + // The smallest sequence number in the newly created file + SequenceNumber smallest_seqno; + // The largest sequence number in the newly created file + SequenceNumber largest_seqno; + // Table properties of the table being flushed + TableProperties table_properties; + + FlushReason flush_reason; +}; + +struct CompactionFileInfo { + // The level of the file. + int level; + + // The file number of the file. + uint64_t file_number; + + // The file number of the oldest blob file this SST file references. + uint64_t oldest_blob_file_number; +}; + +struct CompactionJobInfo { + // the id of the column family where the compaction happened. + uint32_t cf_id; + // the name of the column family where the compaction happened. + std::string cf_name; + // the status indicating whether the compaction was successful or not. + Status status; + // the id of the thread that completed this compaction job. + uint64_t thread_id; + // the job id, which is unique in the same thread. + int job_id; + // the smallest input level of the compaction. + int base_input_level; + // the output level of the compaction. + int output_level; + + // The following variables contain information about compaction inputs + // and outputs. A file may appear in both the input and output lists + // if it was simply moved to a different level. The order of elements + // is the same across input_files and input_file_infos; similarly, it is + // the same across output_files and output_file_infos. + + // The names of the compaction input files. + std::vector<std::string> input_files; + + // Additional information about the compaction input files. + std::vector<CompactionFileInfo> input_file_infos; + + // The names of the compaction output files. + std::vector<std::string> output_files; + + // Additional information about the compaction output files. + std::vector<CompactionFileInfo> output_file_infos; + + // Table properties for input and output tables. + // The map is keyed by values from input_files and output_files. + TablePropertiesCollection table_properties; + + // Reason to run the compaction + CompactionReason compaction_reason; + + // Compression algorithm used for output files + CompressionType compression; + + // If non-null, this variable stores detailed information + // about this compaction. + CompactionJobStats stats; +}; + +struct MemTableInfo { + // the name of the column family to which memtable belongs + std::string cf_name; + // Sequence number of the first element that was inserted + // into the memtable. + SequenceNumber first_seqno; + // Sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + SequenceNumber earliest_seqno; + // Total number of entries in memtable + uint64_t num_entries; + // Total number of deletes in memtable + uint64_t num_deletes; +}; + +struct ExternalFileIngestionInfo { + // the name of the column family + std::string cf_name; + // Path of the file outside the DB + std::string external_file_path; + // Path of the file inside the DB + std::string internal_file_path; + // The global sequence number assigned to keys in this file + SequenceNumber global_seqno; + // Table properties of the table being flushed + TableProperties table_properties; +}; + +// EventListener class contains a set of callback functions that will +// be called when specific RocksDB event happens such as flush. It can +// be used as a building block for developing custom features such as +// stats-collector or external compaction algorithm. +// +// Note that callback functions should not run for an extended period of +// time before the function returns, otherwise RocksDB may be blocked. +// For example, it is not suggested to do DB::CompactFiles() (as it may +// run for a long while) or issue many of DB::Put() (as Put may be blocked +// in certain cases) in the same thread in the EventListener callback. +// However, doing DB::CompactFiles() and DB::Put() in another thread is +// considered safe. +// +// [Threading] All EventListener callback will be called using the +// actual thread that involves in that specific event. For example, it +// is the RocksDB background flush thread that does the actual flush to +// call EventListener::OnFlushCompleted(). +// +// [Locking] All EventListener callbacks are designed to be called without +// the current thread holding any DB mutex. This is to prevent potential +// deadlock and performance issue when using EventListener callback +// in a complex way. +class EventListener { + public: + // A callback function to RocksDB which will be called whenever a + // registered RocksDB flushes a file. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnFlushCompleted(DB* /*db*/, + const FlushJobInfo& /*flush_job_info*/) {} + + // A callback function to RocksDB which will be called before a + // RocksDB starts to flush memtables. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnFlushBegin(DB* /*db*/, + const FlushJobInfo& /*flush_job_info*/) {} + + // A callback function for RocksDB which will be called whenever + // a SST file is deleted. Different from OnCompactionCompleted and + // OnFlushCompleted, this callback is designed for external logging + // service and thus only provide string parameters instead + // of a pointer to DB. Applications that build logic basic based + // on file creations and deletions is suggested to implement + // OnFlushCompleted and OnCompactionCompleted. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from the + // returned value. + virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {} + + // A callback function to RocksDB which will be called before a + // RocksDB starts to compact. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {} + + // A callback function for RocksDB which will be called whenever + // a registered RocksDB compacts a file. The default implementation + // is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just compacted + // a file. + // @param ci a reference to a CompactionJobInfo struct. 'ci' is released + // after this function is returned, and must be copied if it is needed + // outside of this function. + virtual void OnCompactionCompleted(DB* /*db*/, + const CompactionJobInfo& /*ci*/) {} + + // A callback function for RocksDB which will be called whenever + // a SST file is created. Different from OnCompactionCompleted and + // OnFlushCompleted, this callback is designed for external logging + // service and thus only provide string parameters instead + // of a pointer to DB. Applications that build logic basic based + // on file creations and deletions is suggested to implement + // OnFlushCompleted and OnCompactionCompleted. + // + // Historically it will only be called if the file is successfully created. + // Now it will also be called on failure case. User can check info.status + // to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a SST file is being created. It will follow by OnTableFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a memtable is made immutable. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before + // a column family handle is deleted. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // @param handle is a pointer to the column family handle to be deleted + // which will become a dangling pointer after the deletion. + virtual void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* /*handle*/) {} + + // A callback function for RocksDB which will be called after an external + // file is ingested using IngestExternalFile. + // + // Note that the this function will run on the same thread as + // IngestExternalFile(), if this function is blocked, IngestExternalFile() + // will be blocked from finishing. + virtual void OnExternalFileIngested( + DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {} + + // A callback function for RocksDB which will be called before setting the + // background error status to a non-OK value. The new background error status + // is provided in `bg_error` and can be modified by the callback. E.g., a + // callback can suppress errors by resetting it to Status::OK(), thus + // preventing the database from entering read-only mode. We do not provide any + // guarantee when failed flushes/compactions will be rescheduled if the user + // suppresses an error. + // + // Note that this function can run on the same threads as flush, compaction, + // and user writes. So, it is extremely important not to perform heavy + // computations or blocking calls in this function. + virtual void OnBackgroundError(BackgroundErrorReason /* reason */, + Status* /* bg_error */) {} + + // A callback function for RocksDB which will be called whenever a change + // of superversion triggers a change of the stall conditions. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever a file read + // operation finishes. + virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file write + // operation finishes. + virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} + + // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If + // false, then they won't be called. + virtual bool ShouldBeNotifiedOnFileIO() { return false; } + + // A callback function for RocksDB which will be called just before + // starting the automatic recovery process for recoverable background + // errors, such as NoSpace(). The callback can suppress the automatic + // recovery by setting *auto_recovery to false. The database will then + // have to be transitioned out of read-only mode by calling DB::Resume() + virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */, + Status /* bg_error */, + bool* /* auto_recovery */) {} + + // A callback function for RocksDB which will be called once the database + // is recovered from read-only mode after an error. When this is called, it + // means normal writes to the database can be issued and the user can + // initiate any further recovery actions needed + virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + + virtual ~EventListener() {} +}; + +#else + +class EventListener {}; +struct FlushJobInfo {}; + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h new file mode 100644 index 000000000..60256a977 --- /dev/null +++ b/src/rocksdb/include/rocksdb/memory_allocator.h @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +#include <memory> + +namespace ROCKSDB_NAMESPACE { + +// MemoryAllocator is an interface that a client can implement to supply custom +// memory allocation and deallocation methods. See rocksdb/cache.h for more +// information. +// All methods should be thread-safe. +class MemoryAllocator { + public: + virtual ~MemoryAllocator() = default; + + // Name of the cache allocator, printed in the log + virtual const char* Name() const = 0; + + // Allocate a block of at least size. Has to be thread-safe. + virtual void* Allocate(size_t size) = 0; + + // Deallocate previously allocated block. Has to be thread-safe. + virtual void Deallocate(void* p) = 0; + + // Returns the memory size of the block allocated at p. The default + // implementation that just returns the original allocation_size is fine. + virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const { + // default implementation just returns the allocation size + return allocation_size; + } +}; + +struct JemallocAllocatorOptions { + // Jemalloc tcache cache allocations by size class. For each size class, + // it caches between 20 (for large size classes) to 200 (for small size + // classes). To reduce tcache memory usage in case the allocator is access + // by large number of threads, we can control whether to cache an allocation + // by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommneded to set it to block_size/4. + size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommneded to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generate memory allocators which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dump. +// +// Implementation details: +// The JemallocNodumpAllocator creates a delicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator is through the same arena. +// The memory allocator hooks memory allocation of the arena, and call +// madvice() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduce of jemalloc +// metadata for some workload. +// +// To mitigate mutex contention for using one single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incur 0.5M extra memory usage per-thread. The usage +// can be reduce by limitting allocation sizes to cache. +extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr<MemoryAllocator>* memory_allocator); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h new file mode 100644 index 000000000..49723264a --- /dev/null +++ b/src/rocksdb/include/rocksdb/memtablerep.h @@ -0,0 +1,385 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains the interface that must be implemented by any collection +// to be used as the backing store for a MemTable. Such a collection must +// satisfy the following properties: +// (1) It does not store duplicate items. +// (2) It uses MemTableRep::KeyComparator to compare items for iteration and +// equality. +// (3) It can be accessed concurrently by multiple readers and can support +// during reads. However, it needn't support multiple concurrent writes. +// (4) Items are never deleted. +// The liberal use of assertions is encouraged to enforce (1). +// +// The factory will be passed an MemTableAllocator object when a new MemTableRep +// is requested. +// +// Users can implement their own memtable representations. We include three +// types built in: +// - SkipListRep: This is the default; it is backed by a skip list. +// - HashSkipListRep: The memtable rep that is best used for keys that are +// structured like "prefix:suffix" where iteration within a prefix is +// common and iteration across different prefixes is rare. It is backed by +// a hash map where each bucket is a skip list. +// - VectorRep: This is backed by an unordered std::vector. On iteration, the +// vector is sorted. It is intelligent about sorting; once the MarkReadOnly() +// has been called, the vector will only be sorted once. It is optimized for +// random-write-heavy workloads. +// +// The last four implementations are designed for situations in which +// iteration over the entire collection is rare since doing so requires all the +// keys to be copied into a sorted data structure. + +#pragma once + +#include <rocksdb/slice.h> +#include <stdint.h> +#include <stdlib.h> +#include <memory> +#include <stdexcept> + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class Allocator; +class LookupKey; +class SliceTransform; +class Logger; + +typedef void* KeyHandle; + +extern Slice GetLengthPrefixedSlice(const char* data); + +class MemTableRep { + public: + // KeyComparator provides a means to compare keys, which are internal keys + // concatenated with values. + class KeyComparator { + public: + typedef ROCKSDB_NAMESPACE::Slice DecodedType; + + virtual DecodedType decode_key(const char* key) const { + // The format of key is frozen and can be terated as a part of the API + // contract. Refer to MemTable::Add for details. + return GetLengthPrefixedSlice(key); + } + + // Compare a and b. Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; + + virtual ~KeyComparator() {} + }; + + explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} + + // Allocate a buf of len size for storing key. The idea is that a + // specific memtable representation knows its underlying data structure + // better. By allowing it to allocate memory, it can possibly put + // correlated stuff in consecutive memory area to make processor + // prefetching more efficient. + virtual KeyHandle Allocate(const size_t len, char** buf); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert). + // REQUIRES: nothing that compares equal to key is currently in the + // collection, and no concurrent modifications to the table in progress + virtual void Insert(KeyHandle handle) = 0; + + // Same as ::Insert + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKey(KeyHandle handle) { + Insert(handle); + return true; + } + + // Same as Insert(), but in additional pass a hint to insert location for + // the key. If hint points to nullptr, a new hint will be populated. + // otherwise the hint will be updated to reflect the last insert location. + // + // Currently only skip-list based memtable implement the interface. Other + // implementations will fallback to Insert() by default. + virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + Insert(handle); + } + + // Same as ::InsertWithHint + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { + InsertWithHint(handle, hint); + return true; + } + + // Same as ::InsertWithHint, but allow concurrnet write + // + // If hint points to nullptr, a new hint will be allocated on heap, otherwise + // the hint will be updated to reflect the last insert location. The hint is + // owned by the caller and it is the caller's responsibility to delete the + // hint later. + // + // Currently only skip-list based memtable implement the interface. Other + // implementations will fallback to InsertConcurrently() by default. + virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + InsertConcurrently(handle); + } + + // Same as ::InsertWithHintConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) { + InsertWithHintConcurrently(handle, hint); + return true; + } + + // Like Insert(handle), but may be called concurrent with other calls + // to InsertConcurrently for other handles. + // + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual void InsertConcurrently(KeyHandle handle); + + // Same as ::InsertConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKeyConcurrently(KeyHandle handle) { + InsertConcurrently(handle); + return true; + } + + // Returns true iff an entry that compares equal to key is in the collection. + virtual bool Contains(const char* key) const = 0; + + // Notify this table rep that it will no longer be added to. By default, + // does nothing. After MarkReadOnly() is called, this table rep will + // not be written to (ie No more calls to Allocate(), Insert(), + // or any writes done directly to entries accessed through the iterator.) + virtual void MarkReadOnly() {} + + // Notify this table rep that it has been flushed to stable storage. + // By default, does nothing. + // + // Invariant: MarkReadOnly() is called, before MarkFlushed(). + // Note that this method if overridden, should not run for an extended period + // of time. Otherwise, RocksDB may be blocked. + virtual void MarkFlushed() {} + + // Look up key from the mem table, since the first key in the mem table whose + // user_key matches the one given k, call the function callback_func(), with + // callback_args directly forwarded as the first parameter, and the mem table + // key as the second parameter. If the return value is false, then terminates. + // Otherwise, go through the next key. + // + // It's safe for Get() to terminate after having finished all the potential + // key for the k.user_key(), or not. + // + // Default: + // Get() function with a default value of dynamically construct an iterator, + // seek and call the call back function. + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)); + + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, + const Slice& /*end_key*/) { + return 0; + } + + // Report an approximation of how much memory has been used other than memory + // that was allocated through the allocator. Safe to call from any thread. + virtual size_t ApproximateMemoryUsage() = 0; + + virtual ~MemTableRep() {} + + // Iteration over the contents of a skip collection + class Iterator { + public: + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() {} + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const = 0; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const = 0; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + + // retreat to the first entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) = 0; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() = 0; + }; + + // Return an iterator over the keys in this representation. + // arena: If not null, the arena needs to be used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + virtual Iterator* GetIterator(Arena* arena = nullptr) = 0; + + // Return an iterator that has a special Seek semantics. The result of + // a Seek might only include keys with the same prefix as the target key. + // arena: If not null, the arena is used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) { + return GetIterator(arena); + } + + // Return true if the current MemTableRep supports merge operator. + // Default: true + virtual bool IsMergeOperatorSupported() const { return true; } + + // Return true if the current MemTableRep supports snapshot + // Default: true + virtual bool IsSnapshotSupported() const { return true; } + + protected: + // When *key is an internal key concatenated with the value, returns the + // user key. + virtual Slice UserKey(const char* key) const; + + Allocator* allocator_; +}; + +// This is the base class for all factories that are used by RocksDB to create +// new MemTableRep objects +class MemTableRepFactory { + public: + virtual ~MemTableRepFactory() {} + + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) = 0; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& key_cmp, Allocator* allocator, + const SliceTransform* slice_transform, Logger* logger, + uint32_t /* column_family_id */) { + return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); + } + + virtual const char* Name() const = 0; + + // Return true if the current MemTableRep supports concurrent inserts + // Default: false + virtual bool IsInsertConcurrentlySupported() const { return false; } + + // Return true if the current MemTableRep supports detecting duplicate + // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns + // false when if the <key,seq> already exists. + // Default: false + virtual bool CanHandleDuplicatedKey() const { return false; } +}; + +// This uses a skip list to store keys. It is the default. +// +// Parameters: +// lookahead: If non-zero, each iterator's seek operation will start the +// search from the previously visited record (doing at most 'lookahead' +// steps). This is an optimization for the access pattern including many +// seeks with consecutive keys. +class SkipListFactory : public MemTableRepFactory { + public: + explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) override; + virtual const char* Name() const override { return "SkipListFactory"; } + + bool IsInsertConcurrentlySupported() const override { return true; } + + bool CanHandleDuplicatedKey() const override { return true; } + + private: + const size_t lookahead_; +}; + +#ifndef ROCKSDB_LITE +// This creates MemTableReps that are backed by an std::vector. On iteration, +// the vector is sorted. This is useful for workloads where iteration is very +// rare and writes are generally not issued after reads begin. +// +// Parameters: +// count: Passed to the constructor of the underlying std::vector of each +// VectorRep. On initialization, the underlying array will be at least count +// bytes reserved for usage. +class VectorRepFactory : public MemTableRepFactory { + const size_t count_; + + public: + explicit VectorRepFactory(size_t count = 0) : count_(count) {} + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Allocator*, const SliceTransform*, + Logger* logger) override; + + virtual const char* Name() const override { return "VectorRepFactory"; } +}; + +// This class contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). +// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist +extern MemTableRepFactory* NewHashSkipListRepFactory( + size_t bucket_count = 1000000, int32_t skiplist_height = 4, + int32_t skiplist_branching_factor = 4); + +// The factory is to create memtables based on a hash table: +// it contains a fixed array of buckets, each pointing to either a linked list +// or a skip list if number of entries inside the bucket exceeds +// threshold_use_skiplist. +// @bucket_count: number of fixed array buckets +// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. +// Otherwise from huge page TLB. The user needs to reserve +// huge pages for it to be allocated, like: +// sysctl -w vm.nr_hugepages=20 +// See linux doc Documentation/vm/hugetlbpage.txt +// @bucket_entries_logging_threshold: if number of entries in one bucket +// exceeds this number, log about it. +// @if_log_bucket_dist_when_flash: if true, log distribution of number of +// entries when flushing. +// @threshold_use_skiplist: a bucket switches to skip list if number of +// entries exceed this parameter. +extern MemTableRepFactory* NewHashLinkListRepFactory( + size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, + int bucket_entries_logging_threshold = 4096, + bool if_log_bucket_dist_when_flash = true, + uint32_t threshold_use_skiplist = 256); + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h new file mode 100644 index 000000000..86f3d7260 --- /dev/null +++ b/src/rocksdb/include/rocksdb/merge_operator.h @@ -0,0 +1,257 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <deque> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Logger; + +// The Merge Operator +// +// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only +// client knows. It could be numeric addition, list append, string +// concatenation, edit data structure, ... , anything. +// The library, on the other hand, is concerned with the exercise of this +// interface, at the right time (during get, iteration, compaction...) +// +// To use merge, the client needs to provide an object implementing one of +// the following interfaces: +// a) AssociativeMergeOperator - for most simple semantics (always take +// two values, and merge them into one value, which is then put back +// into rocksdb); numeric addition and string concatenation are examples; +// +// b) MergeOperator - the generic class for all the more abstract / complex +// operations; one method (FullMergeV2) to merge a Put/Delete value with a +// merge operand; and another method (PartialMerge) that merges multiple +// operands together. this is especially useful if your key values have +// complex structures but you would still like to support client-specific +// incremental updates. +// +// AssociativeMergeOperator is simpler to implement. MergeOperator is simply +// more powerful. +// +// Refer to rocksdb-merge wiki for more details and example implementations. +// +class MergeOperator { + public: + virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // Client could multiplex the merge operator based on it + // if the key space is partitioned and different subspaces + // refer to different types of data which have different + // merge operation semantics + // existing: (IN) null indicates that the key does not exist before this op + // operand_list:(IN) the sequence of merge operations to apply, front() first. + // new_value:(OUT) Client is responsible for filling the merge result here. + // The string that new_value is pointing to will be empty. + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. This will be treated as an error by the library. + // + // Also make use of the *logger for error messages. + virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque<std::string>& /*operand_list*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + // deprecated, please use FullMergeV2() + assert(false); + return false; + } + + struct MergeOperationInput { + explicit MergeOperationInput(const Slice& _key, + const Slice* _existing_value, + const std::vector<Slice>& _operand_list, + Logger* _logger) + : key(_key), + existing_value(_existing_value), + operand_list(_operand_list), + logger(_logger) {} + + // The key associated with the merge operation. + const Slice& key; + // The existing value of the current key, nullptr means that the + // value doesn't exist. + const Slice* existing_value; + // A list of operands to apply. + const std::vector<Slice>& operand_list; + // Logger could be used by client to log any errors that happen during + // the merge operation. + Logger* logger; + }; + + struct MergeOperationOutput { + explicit MergeOperationOutput(std::string& _new_value, + Slice& _existing_operand) + : new_value(_new_value), existing_operand(_existing_operand) {} + + // Client is responsible for filling the merge result here. + std::string& new_value; + // If the merge result is one of the existing operands (or existing_value), + // client can set this field to the operand (or existing_value) instead of + // using new_value. + Slice& existing_operand; + }; + + // This function applies a stack of merge operands in chrionological order + // on top of an existing value. There are two ways in which this method is + // being used: + // a) During Get() operation, it used to calculate the final value of a key + // b) During compaction, in order to collapse some operands with the based + // value. + // + // Note: The name of the method is somewhat misleading, as both in the cases + // of Get() or compaction it may be called on a subset of operands: + // K: 0 +1 +2 +7 +4 +5 2 +1 +2 + // ^ + // | + // snapshot + // In the example above, Get(K) operation will call FullMerge with a base + // value of 2 and operands [+1, +2]. Compaction process might decide to + // collapse the beginning of the history up to the snapshot by performing + // full Merge with base value of 0 and operands [+1, +2, +7, +3]. + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const; + + // This function performs merge(left_op, right_op) + // when both the operands are themselves merge operation types + // that you would have passed to a DB::Merge() call in the same order + // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)). + // + // PartialMerge should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. + // *new_value should be constructed such that a call to + // DB::Merge(key, *new_value) would yield the same result as a call + // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op). + // + // The string that new_value is pointing to will be empty. + // + // The default implementation of PartialMergeMulti will use this function + // as a helper, for backward compatibility. Any successor class of + // MergeOperator should either implement PartialMerge or PartialMergeMulti, + // although implementing PartialMergeMulti is suggested as it is in general + // more effective to merge multiple operands at a time instead of two + // operands at a time. + // + // If it is impossible or infeasible to combine the two operations, + // leave new_value unchanged and return false. The library will + // internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + // + // TODO: Presently there is no way to differentiate between error/corruption + // and simply "return false". For now, the client should simply return + // false in any case it cannot perform partial-merge, regardless of reason. + // If there is corruption in the data, handle it in the FullMergeV2() function + // and return false there. The default implementation of PartialMerge will + // always return false. + virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, + const Slice& /*right_operand*/, + std::string* /*new_value*/, + Logger* /*logger*/) const { + return false; + } + + // This function performs merge when all the operands are themselves merge + // operation types that you would have passed to a DB::Merge() call in the + // same order (front() first) + // (i.e. DB::Merge(key, operand_list[0]), followed by + // DB::Merge(key, operand_list[1]), ...) + // + // PartialMergeMulti should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. *new_value should + // be constructed such that a call to DB::Merge(key, *new_value) would yield + // the same result as subquential individual calls to DB::Merge(key, operand) + // for each operand in operand_list from front() to back(). + // + // The string that new_value is pointing to will be empty. + // + // The PartialMergeMulti function will be called when there are at least two + // operands. + // + // In the default implementation, PartialMergeMulti will invoke PartialMerge + // multiple times, where each time it only merges two operands. Developers + // should either implement PartialMergeMulti, or implement PartialMerge which + // is served as the helper function of the default PartialMergeMulti. + virtual bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, Logger* logger) const; + + // The name of the MergeOperator. Used to check for MergeOperator + // mismatches (i.e., a DB created with one MergeOperator is + // accessed using a different MergeOperator) + // TODO: the name is currently not stored persistently and thus + // no checking is enforced. Client is responsible for providing + // consistent MergeOperator between DB opens. + virtual const char* Name() const = 0; + + // Determines whether the PartialMerge can be called with just a single + // merge operand. + // Override and return true for allowing a single operand. PartialMerge + // and PartialMergeMulti should be overridden and implemented + // correctly to properly handle a single operand. + virtual bool AllowSingleOperand() const { return false; } + + // Allows to control when to invoke a full merge during Get. + // This could be used to limit the number of merge operands that are looked at + // during a point lookup, thereby helping in limiting the number of levels to + // read from. + // Doesn't help with iterators. + // + // Note: the merge operands are passed to this function in the reversed order + // relative to how they were merged (passed to FullMerge or FullMergeV2) + // for performance reasons, see also: + // https://github.com/facebook/rocksdb/issues/3865 + virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const { + return false; + } +}; + +// The simpler, associative merge operator. +class AssociativeMergeOperator : public MergeOperator { + public: + ~AssociativeMergeOperator() override {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // existing_value:(IN) null indicates the key does not exist before this op + // value: (IN) the value to update/merge the existing_value with + // new_value: (OUT) Client is responsible for filling the merge result + // here. The string that new_value is pointing to will be empty. + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. The client should assume that this will be treated + // as an error by the library. + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const = 0; + + private: + // Default implementations of the MergeOperator functions + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& key, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* logger) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h new file mode 100644 index 000000000..f1a9ee60e --- /dev/null +++ b/src/rocksdb/include/rocksdb/metadata.h @@ -0,0 +1,151 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> + +#include <limits> +#include <string> +#include <vector> + +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyMetaData; +struct LevelMetaData; +struct SstFileMetaData; + +// The metadata that describes a column family. +struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), file_count(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector<LevelMetaData>&& _levels) + : size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector<LevelMetaData> levels; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int _level, uint64_t _size, + const std::vector<SstFileMetaData>&& _files) + : level(_level), size(_size), files(_files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector<SstFileMetaData> files; +}; + +// The metadata that describes a SST file. +struct SstFileMetaData { + SstFileMetaData() + : size(0), + file_number(0), + smallest_seqno(0), + largest_seqno(0), + num_reads_sampled(0), + being_compacted(false), + num_entries(0), + num_deletions(0), + oldest_blob_file_number(0) {} + + SstFileMetaData(const std::string& _file_name, uint64_t _file_number, + const std::string& _path, size_t _size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, uint64_t _num_reads_sampled, + bool _being_compacted, uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + std::string& _file_checksum, + std::string& _file_checksum_func_name) + : size(_size), + name(_file_name), + file_number(_file_number), + db_path(_path), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno), + smallestkey(_smallestkey), + largestkey(_largestkey), + num_reads_sampled(_num_reads_sampled), + being_compacted(_being_compacted), + num_entries(0), + num_deletions(0), + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name) {} + + // File size in bytes. + size_t size; + // The name of the file. + std::string name; + // The id of the file. + uint64_t file_number; + // The full path where the file locates. + std::string db_path; + + SequenceNumber smallest_seqno; // Smallest sequence number in file. + SequenceNumber largest_seqno; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled; // How many times the file is read. + bool being_compacted; // true if the file is currently being compacted. + + uint64_t num_entries; + uint64_t num_deletions; + + uint64_t oldest_blob_file_number; // The id of the oldest blob file + // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. The creation time of the + // oldest SST file that is the compaction ancester of this file. + // The timestamp is provided Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t oldest_ancester_time; + // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t file_creation_time; + + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". + std::string file_checksum_func_name; +}; + +// The full set of metadata associated with each SST file. +struct LiveFileMetaData : SstFileMetaData { + std::string column_family_name; // Name of the column family + int level; // Level at which this file resides. + LiveFileMetaData() : column_family_name(), level(0) {} +}; + +// Metadata returned as output from ExportColumnFamily() and used as input to +// CreateColumnFamiliesWithImport(). +struct ExportImportFilesMetaData { + std::string db_comparator_name; // Used to safety check at import. + std::vector<LiveFileMetaData> files; // Vector of file metadata. +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h new file mode 100644 index 000000000..f5d44fb74 --- /dev/null +++ b/src/rocksdb/include/rocksdb/options.h @@ -0,0 +1,1587 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <limits> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/advanced_options.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/listener.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/version.h" +#include "rocksdb/write_buffer_manager.h" + +#ifdef max +#undef max +#endif + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class CompactionFilter; +class CompactionFilterFactory; +class Comparator; +class ConcurrentTaskLimiter; +class Env; +enum InfoLogLevel : unsigned char; +class SstFileManager; +class FilterPolicy; +class Logger; +class MergeOperator; +class Snapshot; +class MemTableRepFactory; +class RateLimiter; +class Slice; +class Statistics; +class InternalKeyComparator; +class WalFilter; +class FileSystem; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. + kDisableCompressionOption = 0xff, +}; + +struct Options; +struct DbPath; + +struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { + // The function recovers options to a previous version. Only 4.6 or later + // versions are supported. + ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + // Some functions that make it easier to optimize RocksDB + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + // An optional cache object is passed in to be used as the block cache + ColumnFamilyOptions* OptimizeForSmallDb( + std::shared_ptr<Cache>* cache = nullptr); + + // Use this if you don't need to keep the data sorted, i.e. you'll never use + // an iterator, only Put() and Get() API calls + // + // Not supported in ROCKSDB_LITE + ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb); + + // Default values for some parameters in ColumnFamilyOptions are not + // optimized for heavy workloads and big datasets, which means you might + // observe write stalls under some conditions. As a starting point for tuning + // RocksDB options, use the following two functions: + // * OptimizeLevelStyleCompaction -- optimizes level style compaction + // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction + // Universal style compaction is focused on reducing Write Amplification + // Factor for big data sets, but increases Space Amplification. You can learn + // more about the different styles here: + // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide + // Make sure to also call IncreaseParallelism(), which will provide the + // biggest performance gains. + // Note: we might use more memory than memtable_memory_budget during high + // write rate period + // + // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE + ColumnFamilyOptions* OptimizeLevelStyleCompaction( + uint64_t memtable_memory_budget = 512 * 1024 * 1024); + ColumnFamilyOptions* OptimizeUniversalStyleCompaction( + uint64_t memtable_memory_budget = 512 * 1024 * 1024); + + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator = BytewiseComparator(); + + // REQUIRES: The client must provide a merge operator if Merge operation + // needs to be accessed. Calling Merge on a DB without a merge operator + // would result in Status::NotSupported. The client must ensure that the + // merge operator supplied here has the same name and *exactly* the same + // semantics as the merge operator provided to previous open calls on + // the same DB. The only exception is reserved for upgrade, where a DB + // previously without a merge operator is introduced to Merge operation + // for the first time. It's necessary to specify a merge operator when + // opening the DB in this case. + // Default: nullptr + std::shared_ptr<MergeOperator> merge_operator = nullptr; + + // A single CompactionFilter instance to call into during compaction. + // Allows an application to modify/delete a key-value during background + // compaction. + // + // If the client requires a new compaction filter to be used for different + // compaction runs, it can specify compaction_filter_factory instead of this + // option. The client should specify only one of the two. + // compaction_filter takes precedence over compaction_filter_factory if + // client specifies both. + // + // If multithreaded compaction is being used, the supplied CompactionFilter + // instance may be used from different threads concurrently and so should be + // thread-safe. + // + // Default: nullptr + const CompactionFilter* compaction_filter = nullptr; + + // This is a factory that provides compaction filter objects which allow + // an application to modify/delete a key-value during background compaction. + // + // A new filter will be created on each compaction run. If multithreaded + // compaction is being used, each created CompactionFilter will only be used + // from a single thread and so does not need to be thread-safe. + // + // Default: nullptr + std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to max_write_buffer_number write buffers may be held in memory + // at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Note that write_buffer_size is enforced per column family. + // See db_write_buffer_size for sharing memory across column families. + // + // Default: 64MB + // + // Dynamically changeable through SetOptions() API + size_t write_buffer_size = 64 << 20; + + // Compress blocks using the specified compression algorithm. + // + // Default: kSnappyCompression, if it's supported. If snappy is not linked + // with the library, the default is kNoCompression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + // + // If you do not set `compression_opts.level`, or set it to + // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the + // default corresponding to `compression` as follows: + // + // - kZSTD: 3 + // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1) + // - kLZ4HCCompression: 0 + // - For all others, we do not specify a compression level + // + // Dynamically changeable through SetOptions() API + CompressionType compression; + + // Compression algorithm that will be used for the bottommost level that + // contain files. + // + // Default: kDisableCompressionOption (Disabled) + CompressionType bottommost_compression = kDisableCompressionOption; + + // different options for compression algorithms used by bottommost_compression + // if it is enabled. To enable it, please see the definition of + // CompressionOptions. + CompressionOptions bottommost_compression_opts; + + // different options for compression algorithms + CompressionOptions compression_opts; + + // Number of files to trigger level-0 compaction. A value <0 means that + // level-0 compaction will not be triggered by number of files at all. + // + // Default: 4 + // + // Dynamically changeable through SetOptions() API + int level0_file_num_compaction_trigger = 4; + + // If non-nullptr, use the specified function to determine the + // prefixes for keys. These prefixes will be placed in the filter. + // Depending on the workload, this can reduce the number of read-IOP + // cost for scans when a prefix is passed via ReadOptions to + // db.NewIterator(). For prefix filtering to work properly, + // "prefix_extractor" and "comparator" must be such that the following + // properties hold: + // + // 1) key.starts_with(prefix(key)) + // 2) Compare(prefix(key), key) <= 0. + // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 + // 4) prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + std::shared_ptr<const SliceTransform> prefix_extractor = nullptr; + + // Control maximum total data size for a level. + // max_bytes_for_level_base is the max total for level-1. + // Maximum number of bytes for level L can be calculated as + // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) + // For example, if max_bytes_for_level_base is 200MB, and if + // max_bytes_for_level_multiplier is 10, total data size for level-1 + // will be 200MB, total file size for level-2 will be 2GB, + // and total file size for level-3 will be 20GB. + // + // Default: 256MB. + // + // Dynamically changeable through SetOptions() API + uint64_t max_bytes_for_level_base = 256 * 1048576; + + // Deprecated. + uint64_t snap_refresh_nanos = 0; + + // Disable automatic compactions. Manual compactions can still + // be issued on this column family + // + // Dynamically changeable through SetOptions() API + bool disable_auto_compactions = false; + + // This is a factory that provides TableFactory objects. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. + std::shared_ptr<TableFactory> table_factory; + + // A list of paths where SST files for this column family + // can be put into, with its target size. Similar to db_paths, + // newer data is placed into paths specified earlier in the + // vector while older data gradually moves to paths specified + // later in the vector. + // Note that, if a path is supplied to multiple column + // families, it would have files and total size from all + // the column families combined. User should provision for the + // total size(from all the column families) in such cases. + // + // If left empty, db_paths will be used. + // Default: empty + std::vector<DbPath> cf_paths; + + // Compaction concurrent thread limiter for the column family. + // If non-nullptr, use given concurrent thread limiter to control + // the max outstanding compaction tasks. Limiter can be shared with + // multiple column families across db instances. + // + // Default: nullptr + std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr; + + // Create ColumnFamilyOptions with default values for all fields + ColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit ColumnFamilyOptions(const Options& options); + + void Dump(Logger* log) const; +}; + +enum class WALRecoveryMode : char { + // Original levelDB recovery + // We tolerate incomplete record in trailing data on all logs + // Use case : This is legacy behavior + kTolerateCorruptedTailRecords = 0x00, + // Recover from clean shutdown + // We don't expect to find any corruption in the WAL + // Use case : This is ideal for unit tests and rare applications that + // can require high consistency guarantee + kAbsoluteConsistency = 0x01, + // Recover to point-in-time consistency (default) + // We stop the WAL playback on discovering WAL inconsistency + // Use case : Ideal for systems that have disk controller cache like + // hard disk, SSD without super capacitor that store related data + kPointInTimeRecovery = 0x02, + // Recovery after a disaster + // We ignore any corruption in the WAL and try to salvage as much data as + // possible + // Use case : Ideal for last ditch effort to recover data or systems that + // operate with low grade unrelated data + kSkipAnyCorruptedRecords = 0x03, +}; + +struct DbPath { + std::string path; + uint64_t target_size; // Target size of total files under the path, in byte. + + DbPath() : target_size(0) {} + DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} +}; + +struct DBOptions { + // The function recovers options to the option as in version 4.6. + DBOptions* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + // Some functions that make it easier to optimize RocksDB + + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + // An optional cache object is passed in for the memory of the + // memtable to cost to + DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr); + +#ifndef ROCKSDB_LITE + // By default, RocksDB uses only one background thread for flush and + // compaction. Calling this function will set it up such that total of + // `total_threads` is used. Good value for `total_threads` is the number of + // cores. You almost definitely want to call this function if your system is + // bottlenecked by RocksDB. + DBOptions* IncreaseParallelism(int total_threads = 16); +#endif // ROCKSDB_LITE + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing = false; + + // If true, missing column families will be automatically created. + // Default: false + bool create_missing_column_families = false; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists = false; + + // If true, RocksDB will aggressively check consistency of the data. + // Also, if any of the writes to the database fails (Put, Delete, Merge, + // Write), the database will switch to read-only mode and fail all other + // Write operations. + // In most cases you want this to be set to true. + // Default: true + bool paranoid_checks = true; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. In the near + // future, support for doing storage operations such as read/write files + // through env will be deprecated in favor of file_system (see below) + // Default: Env::Default() + Env* env = Env::Default(); + + // Use the specified object to interact with the storage to + // read/write files. This is in addition to env. This option should be used + // if the desired storage subsystem provides a FileSystem implementation. + std::shared_ptr<FileSystem> file_system = nullptr; + + // Use to control write rate of flush and compaction. Flush has higher + // priority than compaction. Rate limiting is disabled if nullptr. + // If rate limiter is enabled, bytes_per_sync is set to 1MB by default. + // Default: nullptr + std::shared_ptr<RateLimiter> rate_limiter = nullptr; + + // Use to track SST files and control their file deletion rate. + // + // Features: + // - Throttle the deletion rate of the SST files. + // - Keep track the total size of all SST files. + // - Set a maximum allowed space limit for SST files that when reached + // the DB wont do any further flushes or compactions and will set the + // background error. + // - Can be shared between multiple dbs. + // Limitations: + // - Only track and throttle deletes of SST files in + // first db_path (db_name if db_paths is empty). + // + // Default: nullptr + std::shared_ptr<SstFileManager> sst_file_manager = nullptr; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-nullptr, or to a file stored + // in the same directory as the DB contents if info_log is nullptr. + // Default: nullptr + std::shared_ptr<Logger> info_log = nullptr; + +#ifdef NDEBUG + InfoLogLevel info_log_level = INFO_LEVEL; +#else + InfoLogLevel info_log_level = DEBUG_LEVEL; +#endif // NDEBUG + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. + // + // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. + int max_open_files = -1; + + // If max_open_files is -1, DB will open all files on DB::Open(). You can + // use this option to increase the number of threads used to open the files. + // Default: 16 + int max_file_opening_threads = 16; + + // Once write-ahead logs exceed this size, we will start forcing the flush of + // column families whose memtables are backed by the oldest live WAL file + // (i.e. the ones that are causing all the space amplification). If set to 0 + // (default), we will dynamically choose the WAL size limit to be + // [sum of all write_buffer_size * max_write_buffer_number] * 4 + // This option takes effect only when there are more than one column family as + // otherwise the wal size is dictated by the write_buffer_size. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + uint64_t max_total_wal_size = 0; + + // If non-null, then we should collect metrics about database operations + std::shared_ptr<Statistics> statistics = nullptr; + + // By default, writes to stable storage use fdatasync (on platforms + // where this function is available). If this option is true, + // fsync is used instead. + // + // fsync and fdatasync are equally safe for our purposes and fdatasync is + // faster, so it is rarely necessary to set this option. It is provided + // as a workaround for kernel/filesystem bugs, such as one that affected + // fdatasync with ext4 in kernel versions prior to 3.7. + bool use_fsync = false; + + // A list of paths where SST files can be put into, with its target size. + // Newer data is placed into paths specified earlier in the vector while + // older data gradually moves to paths specified later in the vector. + // + // For example, you have a flash device with 10GB allocated for the DB, + // as well as a hard drive of 2TB, you should config it to be: + // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] + // + // The system will try to guarantee data under each path is close to but + // not larger than the target size. But current and future file sizes used + // by determining where to place a file are based on best-effort estimation, + // which means there is a chance that the actual size under the directory + // is slightly more than target size under some workloads. User should give + // some buffer room for those cases. + // + // If none of the paths has sufficient room to place a file, the file will + // be placed to the last path anyway, despite to the target size. + // + // Placing newer data to earlier paths is also best-efforts. User should + // expect user files to be placed in higher levels in some extreme cases. + // + // If left empty, only one path will be used, which is db_name passed when + // opening the DB. + // Default: empty + std::vector<DbPath> db_paths; + + // This specifies the info LOG dir. + // If it is empty, the log files will be in the same dir as data. + // If it is non empty, the log files will be in the specified dir, + // and the db data dir's absolute path will be used as the log file + // name's prefix. + std::string db_log_dir = ""; + + // This specifies the absolute dir path for write-ahead logs (WAL). + // If it is empty, the log files will be in the same dir as data, + // dbname is used as the data dir by default + // If it is non empty, the log files will be in kept the specified dir. + // When destroying the db, + // all log files in wal_dir and the dir itself is deleted + std::string wal_dir = ""; + + // The periodicity when obsolete files get deleted. The default + // value is 6 hours. The files that get out of scope by compaction + // process will still get automatically delete on every compaction, + // regardless of this setting + // + // Default: 6 hours + // + // Dynamically changeable through SetDBOptions() API. + uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000; + + // Maximum number of concurrent background jobs (compactions and flushes). + // + // Default: 2 + // + // Dynamically changeable through SetDBOptions() API. + int max_background_jobs = 2; + + // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // value of max_background_jobs. This option is ignored. + // + // Dynamically changeable through SetDBOptions() API. + int base_background_compactions = -1; + + // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // value of max_background_jobs. For backwards compatibility we will set + // `max_background_jobs = max_background_compactions + max_background_flushes` + // in the case where user sets at least one of `max_background_compactions` or + // `max_background_flushes` (we replace -1 by 1 in case one option is unset). + // + // Maximum number of concurrent background compaction jobs, submitted to + // the default LOW priority thread pool. + // + // If you're increasing this, also consider increasing number of threads in + // LOW priority thread pool. For more information, see + // Env::SetBackgroundThreads + // + // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. + int max_background_compactions = -1; + + // This value represents the maximum number of threads that will + // concurrently perform a compaction job by breaking it into multiple, + // smaller ones that are run simultaneously. + // Default: 1 (i.e. no subcompactions) + uint32_t max_subcompactions = 1; + + // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // value of max_background_jobs. For backwards compatibility we will set + // `max_background_jobs = max_background_compactions + max_background_flushes` + // in the case where user sets at least one of `max_background_compactions` or + // `max_background_flushes`. + // + // Maximum number of concurrent background memtable flush jobs, submitted by + // default to the HIGH priority thread pool. If the HIGH priority thread pool + // is configured to have zero threads, flush jobs will share the LOW priority + // thread pool with compaction jobs. + // + // It is important to use both thread pools when the same Env is shared by + // multiple db instances. Without a separate pool, long running compaction + // jobs could potentially block memtable flush jobs of other db instances, + // leading to unnecessary Put stalls. + // + // If you're increasing this, also consider increasing number of threads in + // HIGH priority thread pool. For more information, see + // Env::SetBackgroundThreads + // Default: -1 + int max_background_flushes = -1; + + // Specify the maximal size of the info log file. If the log file + // is larger than `max_log_file_size`, a new info log file will + // be created. + // If max_log_file_size == 0, all logs will be written to one + // log file. + size_t max_log_file_size = 0; + + // Time for the info log file to roll (in seconds). + // If specified with non-zero value, log file will be rolled + // if it has been active longer than `log_file_time_to_roll`. + // Default: 0 (disabled) + // Not supported in ROCKSDB_LITE mode! + size_t log_file_time_to_roll = 0; + + // Maximal info log files to be kept. + // Default: 1000 + size_t keep_log_file_num = 1000; + + // Recycle log files. + // If non-zero, we will reuse previously written log files for new + // logs, overwriting the old data. The value indicates how many + // such files we will keep around at any point in time for later + // use. This is more efficient because the blocks are already + // allocated and fdatasync does not need to update the inode after + // each write. + // Default: 0 + size_t recycle_log_file_num = 0; + + // manifest file is rolled over on reaching this limit. + // The older manifest file be deleted. + // The default value is 1GB so that the manifest file can grow, but not + // reach the limit of storage capacity. + uint64_t max_manifest_file_size = 1024 * 1024 * 1024; + + // Number of shards used for table cache. + int table_cache_numshardbits = 6; + + // NOT SUPPORTED ANYMORE + // int table_cache_remove_scan_count_limit; + + // The following two fields affect how archived logs will be deleted. + // 1. If both set to 0, logs will be deleted asap and will not get into + // the archive. + // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + // WAL files will be checked every 10 min and if total size is greater + // then WAL_size_limit_MB, they will be deleted starting with the + // earliest until size_limit is met. All empty files will be deleted. + // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + // WAL files will be checked every WAL_ttl_seconds / 2 and those that + // are older than WAL_ttl_seconds will be deleted. + // 4. If both are not 0, WAL files will be checked every 10 min and both + // checks will be performed with ttl being first. + uint64_t WAL_ttl_seconds = 0; + uint64_t WAL_size_limit_MB = 0; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size = 4 * 1024 * 1024; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads = false; + + // Allow the OS to mmap file for writing. + // DB::SyncWAL() only works if this is set to false. + // Default: false + bool allow_mmap_writes = false; + + // Enable direct I/O mode for read/write + // they may or may not improve performance depending on the use case + // + // Files will be opened in "direct I/O" mode + // which means that data r/w from the disk will not be cached or + // buffered. The hardware buffer of the devices may however still + // be used. Memory mapped files are not impacted by these parameters. + + // Use O_DIRECT for user and compaction reads. + // When true, we also force new_table_reader_for_compaction_inputs to true. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool use_direct_reads = false; + + // Use O_DIRECT for writes in background flush and compactions. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool use_direct_io_for_flush_and_compaction = false; + + // If false, fallocate() calls are bypassed + bool allow_fallocate = true; + + // Disable child process inherit open files. Default: true + bool is_fd_close_on_exec = true; + + // NOT SUPPORTED ANYMORE -- this options is no longer used + bool skip_log_error_on_recovery = false; + + // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // + // Default: 600 (10 min) + // + // Dynamically changeable through SetDBOptions() API. + unsigned int stats_dump_period_sec = 600; + + // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec + // Default: 600 + unsigned int stats_persist_period_sec = 600; + + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. User can query through `GetStatsHistory` API. + // If user attempts to create a column family with the same name on a DB + // which have previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When peristing stats to disk, the stat name will be limited at 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + + // if not zero, periodically take stats snapshots and store in memory, the + // memory size for stats snapshots is capped at stats_history_buffer_size + // Default: 1MB + size_t stats_history_buffer_size = 1024 * 1024; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open = true; + + // Amount of data to build up in memtables across all column + // families before writing to disk. + // + // This is distinct from write_buffer_size, which enforces a limit + // for a single memtable. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: 0 (disabled) + size_t db_write_buffer_size = 0; + + // The memory usage of memtable will report to this object. The same object + // can be passed into multiple DBs and it will track the sum of size of all + // the DBs. If the total size of all live memtables of all the DBs exceeds + // a limit, a flush will be triggered in the next DB to which the next write + // is issued. + // + // If the object is only passed to one DB, the behavior is the same as + // db_write_buffer_size. When write_buffer_manager is set, the value set will + // override db_write_buffer_size. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: null + std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED }; + AccessHint access_hint_on_compaction_start = NORMAL; + + // If true, always create a new file descriptor and new table reader + // for compaction inputs. Turn this parameter on may introduce extra + // memory usage in the table reader, if it allocates extra memory + // for indexes. This will allow file descriptor prefetch options + // to be set for compaction input files and not to impact file + // descriptors for the same file used by user queries. + // Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks + // for this mode if using block-based table. + // + // Default: false + // This flag has no affect on the behavior of compaction and plan to delete + // in the future. + bool new_table_reader_for_compaction_inputs = false; + + // If non-zero, we perform bigger reads when doing compaction. If you're + // running RocksDB on spinning disks, you should set this to at least 2MB. + // That way RocksDB's compaction is doing sequential instead of random reads. + // + // When non-zero, we also force new_table_reader_for_compaction_inputs to + // true. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + size_t compaction_readahead_size = 0; + + // This is a maximum buffer size that is used by WinMmapReadableFile in + // unbuffered disk I/O mode. We need to maintain an aligned buffer for + // reads. We allow the buffer to grow until the specified value and then + // for bigger requests allocate one shot buffers. In unbuffered mode we + // always bypass read-ahead buffer at ReadaheadRandomAccessFile + // When read-ahead is required we then make use of compaction_readahead_size + // value and always try to read ahead. With read-ahead we always + // pre-allocate buffer to the size instead of growing it up to a limit. + // + // This option is currently honored only on Windows + // + // Default: 1 Mb + // + // Special value: 0 - means do not maintain per instance buffer. Allocate + // per request buffer and avoid locking. + size_t random_access_max_buffer_size = 1024 * 1024; + + // This is the maximum buffer size that is used by WritableFileWriter. + // On Windows, we need to maintain an aligned buffer for writes. + // We allow the buffer to grow until it's size hits the limit in buffered + // IO and fix the buffer size when using direct IO to ensure alignment of + // write requests if the logical sector size is unusual + // + // Default: 1024 * 1024 (1 MB) + // + // Dynamically changeable through SetDBOptions() API. + size_t writable_file_max_buffer_size = 1024 * 1024; + + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex = false; + + // Create DBOptions with default values for all fields + DBOptions(); + // Create DBOptions from Options + explicit DBOptions(const Options& options); + + void Dump(Logger* log) const; + + // Allows OS to incrementally sync files to disk while they are being + // written, asynchronously, in the background. This operation can be used + // to smooth out write I/Os over time. Users shouldn't rely on it for + // persistency guarantee. + // Issue one request for every bytes_per_sync written. 0 turns it off. + // + // You may consider using rate_limiter to regulate write rate to device. + // When rate limiter is enabled, it automatically enables bytes_per_sync + // to 1MB. + // + // This option applies to table files + // + // Default: 0, turned off + // + // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead + // Dynamically changeable through SetDBOptions() API. + uint64_t bytes_per_sync = 0; + + // Same as bytes_per_sync, but applies to WAL files + // + // Default: 0, turned off + // + // Dynamically changeable through SetDBOptions() API. + uint64_t wal_bytes_per_sync = 0; + + // When true, guarantees WAL files have at most `wal_bytes_per_sync` + // bytes submitted for writeback at any given time, and SST files have at most + // `bytes_per_sync` bytes pending writeback at any given time. This can be + // used to handle cases where processing speed exceeds I/O speed during file + // generation, which can lead to a huge sync when the file is finished, even + // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured. + // + // - If `sync_file_range` is supported it achieves this by waiting for any + // prior `sync_file_range`s to finish before proceeding. In this way, + // processing (compression, etc.) can proceed uninhibited in the gap + // between `sync_file_range`s, and we block only when I/O falls behind. + // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism + // always blocks, thus preventing the interleaving of I/O and processing. + // + // Note: Enabling this option does not provide any additional persistence + // guarantees, as it may use `sync_file_range`, which does not write out + // metadata. + // + // Default: false + bool strict_bytes_per_sync = false; + + // A vector of EventListeners whose callback functions will be called + // when specific RocksDB event happens. + std::vector<std::shared_ptr<EventListener>> listeners; + + // If true, then the status of the threads involved in this DB will + // be tracked and available via GetThreadList() API. + // + // Default: false + bool enable_thread_tracking = false; + + // The limited write rate to DB if soft_pending_compaction_bytes_limit or + // level0_slowdown_writes_trigger is triggered, or we are writing to the + // last mem table allowed and we allow more than 3 mem tables. It is + // calculated using size of user write requests before compression. + // RocksDB may decide to slow down more if the compaction still + // gets behind further. + // If the value is 0, we will infer a value from `rater_limiter` value + // if it is not empty, or 16MB if `rater_limiter` is empty. Note that + // if users change the rate in `rate_limiter` after DB is opened, + // `delayed_write_rate` won't be adjusted. + // + // Unit: byte per second. + // + // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. + uint64_t delayed_write_rate = 0; + + // By default, a single write thread queue is maintained. The thread gets + // to the head of the queue becomes write batch group leader and responsible + // for writing to WAL and memtable for the batch group. + // + // If enable_pipelined_write is true, separate write thread queue is + // maintained for WAL write and memtable write. A write thread first enter WAL + // writer queue and then memtable writer queue. Pending thread on the WAL + // writer queue thus only have to wait for previous writers to finish their + // WAL writing but not the memtable writing. Enabling the feature may improve + // write throughput and reduce latency of the prepare phase of two-phase + // commit. + // + // Default: false + bool enable_pipelined_write = false; + + // Setting unordered_write to true trades higher write throughput with + // relaxing the immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. + // + // By default, i.e., when it is false, rocksdb does not advance the sequence + // number for new snapshots unless all the writes with lower sequence numbers + // are already finished. This provides the immutability that we except from + // snapshots. Moreover, since Iterator and MultiGet internally depend on + // snapshots, the snapshot immutability results into Iterator and MultiGet + // offering consistent-point-in-time view. If set to true, although + // Read-Your-Own-Write property is still provided, the snapshot immutability + // property is relaxed: the writes issued after the snapshot is obtained (with + // larger sequence numbers) will be still not visible to the reads from that + // snapshot, however, there still might be pending writes (with lower sequence + // number) that will change the state visible to the snapshot after they are + // landed to the memtable. + // + // Default: false + bool unordered_write = false; + + // If true, allow multi-writers to update mem tables in parallel. + // Only some memtable_factory-s support concurrent writes; currently it + // is implemented only for SkipListFactory. Concurrent memtable writes + // are not compatible with inplace_update_support or filter_deletes. + // It is strongly recommended to set enable_write_thread_adaptive_yield + // if you are going to use this feature. + // + // Default: true + bool allow_concurrent_memtable_write = true; + + // If true, threads synchronizing with the write batch group leader will + // wait for up to write_thread_max_yield_usec before blocking on a mutex. + // This can substantially improve throughput for concurrent workloads, + // regardless of whether allow_concurrent_memtable_write is enabled. + // + // Default: true + bool enable_write_thread_adaptive_yield = true; + + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + // + // Default: 1 MB + uint64_t max_write_batch_group_size_bytes = 1 << 20; + + // The maximum number of microseconds that a write operation will use + // a yielding spin loop to coordinate with other write threads before + // blocking on a mutex. (Assuming write_thread_slow_yield_usec is + // set properly) increasing this value is likely to increase RocksDB + // throughput at the expense of increased CPU usage. + // + // Default: 100 + uint64_t write_thread_max_yield_usec = 100; + + // The latency in microseconds after which a std::this_thread::yield + // call (sched_yield on Linux) is considered to be a signal that + // other processes or threads would like to use the current core. + // Increasing this makes writer threads more likely to take CPU + // by spinning, which will show up as an increase in the number of + // involuntary context switches. + // + // Default: 3 + uint64_t write_thread_slow_yield_usec = 3; + + // If true, then DB::Open() will not update the statistics used to optimize + // compaction decision by loading table properties from many files. + // Turning off this feature will improve DBOpen time especially in + // disk environment. + // + // Default: false + bool skip_stats_update_on_db_open = false; + + // If true, then DB::Open() will not fetch and check sizes of all sst files. + // This may significantly speed up startup if there are many sst files, + // especially when using non-default Env with expensive GetFileSize(). + // We'll still check that all required sst files exist. + // If paranoid_checks is false, this option is ignored, and sst files are + // not checked at all. + // + // Default: false + bool skip_checking_sst_file_sizes_on_db_open = false; + + // Recovery mode to control the consistency while replaying WAL + // Default: kPointInTimeRecovery + WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + // if set to false then recovery will fail when a prepared + // transaction is encountered in the WAL + bool allow_2pc = false; + + // A global cache for table-level rows. + // Default: nullptr (disabled) + // Not supported in ROCKSDB_LITE mode! + std::shared_ptr<Cache> row_cache = nullptr; + +#ifndef ROCKSDB_LITE + // A filter object supplied to be invoked while processing write-ahead-logs + // (WALs) during recovery. The filter provides a way to inspect log + // records, ignoring a particular record or skipping replay. + // The filter is invoked at startup and is invoked from a single-thread + // currently. + WalFilter* wal_filter = nullptr; +#endif // ROCKSDB_LITE + + // If true, then DB::Open / CreateColumnFamily / DropColumnFamily + // / SetOptions will fail if options file is not detected or properly + // persisted. + // + // DEFAULT: false + bool fail_if_options_file_error = false; + + // If true, then print malloc stats together with rocksdb.stats + // when printing to LOG. + // DEFAULT: false + bool dump_malloc_stats = false; + + // By default RocksDB replay WAL logs and flush them on DB open, which may + // create very small SST files. If this option is enabled, RocksDB will try + // to avoid (but not guarantee not to) flush during recovery. Also, existing + // WAL logs will be kept, so that if crash happened before flush, we still + // have logs to recover from. + // + // DEFAULT: false + bool avoid_flush_during_recovery = false; + + // By default RocksDB will flush all memtables on DB close if there are + // unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup + // DB close. Unpersisted data WILL BE LOST. + // + // DEFAULT: false + // + // Dynamically changeable through SetDBOptions() API. + bool avoid_flush_during_shutdown = false; + + // Set this option to true during creation of database if you want + // to be able to ingest behind (call IngestExternalFile() skipping keys + // that already exist, rather than overwriting matching keys). + // Setting this option to true will affect 2 things: + // 1) Disable some internal optimizations around SST file compression + // 2) Reserve bottom-most level for ingested files only. + // 3) Note that num_levels should be >= 3 if this option is turned on. + // + // DEFAULT: false + // Immutable. + bool allow_ingest_behind = false; + + // Needed to support differential snapshots. + // If set to true then DB will only process deletes with sequence number + // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts). + // Clients are responsible to periodically call this method to advance + // the cutoff time. If this method is never called and preserve_deletes + // is set to true NO deletes will ever be processed. + // At the moment this only keeps normal deletes, SingleDeletes will + // not be preserved. + // DEFAULT: false + // Immutable (TODO: make it dynamically changeable) + bool preserve_deletes = false; + + // If enabled it uses two queues for writes, one for the ones with + // disable_memtable and one for the ones that also write to memtable. This + // allows the memtable writes not to lag behind other writes. It can be used + // to optimize MySQL 2PC in which only the commits, which are serial, write to + // memtable. + bool two_write_queues = false; + + // If true WAL is not flushed automatically after each write. Instead it + // relies on manual invocation of FlushWAL to write the WAL buffer to its + // file. + bool manual_wal_flush = false; + + // If true, RocksDB supports flushing multiple column families and committing + // their results atomically to MANIFEST. Note that it is not + // necessary to set atomic_flush to true if WAL is always enabled since WAL + // allows the database to be restored to the last persistent state in WAL. + // This option is useful when there are column families with writes NOT + // protected by WAL. + // For manual flush, application has to specify which column families to + // flush atomically in DB::Flush. + // For auto-triggered flush, RocksDB atomically flushes ALL column families. + // + // Currently, any WAL-enabled writes after atomic flush may be replayed + // independently if the process crashes later and tries to recover. + bool atomic_flush = false; + + // If true, working thread may avoid doing unnecessary and long-latency + // operation (such as deleting obsolete files directly or deleting memtable) + // and will instead schedule a background job to do it. + // Use it if you're latency-sensitive. + // If set to true, takes precedence over + // ReadOptions::background_purge_on_iterator_cleanup. + bool avoid_unnecessary_blocking_io = false; + + // Historically DB ID has always been stored in Identity File in DB folder. + // If this flag is true, the DB ID is written to Manifest file in addition + // to the Identity file. By doing this 2 problems are solved + // 1. We don't checksum the Identity file where as Manifest file is. + // 2. Since the source of truth for DB is Manifest file DB ID will sit with + // the source of truth. Previously the Identity file could be copied + // independent of Manifest and that can result in wrong DB ID. + // We recommend setting this flag to true. + // Default: false + bool write_dbid_to_manifest = false; + + // The number of bytes to prefetch when reading the log. This is mostly useful + // for reading a remotely located log, as it can save the number of + // round-trips. If 0, then the prefetching is disabled. + // + // Default: 0 + size_t log_readahead_size = 0; + + // If user does NOT provide SST file checksum function, the SST file checksum + // will NOT be used. The single checksum instance are shared by options and + // file writers. Make sure the algorithm is thread safe. + // + // Default: nullptr + std::shared_ptr<FileChecksumFunc> sst_file_checksum_func = nullptr; +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options : public DBOptions, public ColumnFamilyOptions { + // Create an Options object with default values for all fields. + Options() : DBOptions(), ColumnFamilyOptions() {} + + Options(const DBOptions& db_options, + const ColumnFamilyOptions& column_family_options) + : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} + + // The function recovers options to the option as in version 4.6. + Options* OldDefaults(int rocksdb_major_version = 4, + int rocksdb_minor_version = 6); + + void Dump(Logger* log) const; + + void DumpCFOptions(Logger* log) const; + + // Some functions that make it easier to optimize RocksDB + + // Set appropriate parameters for bulk loading. + // The reason that this is a function that returns "this" instead of a + // constructor is to enable chaining of multiple similar calls in the future. + // + + // All data will be in level 0 without any automatic compaction. + // It's recommended to manually call CompactRange(NULL, NULL) before reading + // from the database, because otherwise the read can be very slow. + Options* PrepareForBulkLoad(); + + // Use this if your DB is very small (like under 1GB) and you don't want to + // spend lots of memory for memtables. + Options* OptimizeForSmallDb(); +}; + +// +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies kBlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +enum ReadTier { + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1, // data in memtable or block cache + kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option + // will skip data in memtable. + // Note that this ReadTier currently only supports + // Get and MultiGet and does not support iterators. + kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators. +}; + +// Options that control read operations +struct ReadOptions { + // If "snapshot" is non-nullptr, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is nullptr, use an implicit + // snapshot of the state at the beginning of this read operation. + // Default: nullptr + const Snapshot* snapshot; + + // `iterate_lower_bound` defines the smallest key at which the backward + // iterator can return an entry. Once the bound is passed, Valid() will be + // false. `iterate_lower_bound` is inclusive ie the bound value is a valid + // entry. + // + // If prefix_extractor is not null, the Seek target and `iterate_lower_bound` + // need to have the same prefix. This is because ordering is not guaranteed + // outside of prefix domain. + // + // Default: nullptr + const Slice* iterate_lower_bound; + + // "iterate_upper_bound" defines the extent upto which the forward iterator + // can returns entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If prefix_extractor is not null, the Seek target + // and iterate_upper_bound need to have the same prefix. + // This is because ordering is not guaranteed outside of prefix domain. + // + // Default: nullptr + const Slice* iterate_upper_bound; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file. The readahead starts at 8KB and doubles on every + // additional read upto 256KB. + // This option can help if most of the range scans are large, and if it is + // determined that a larger readahead than that enabled by auto-readahead is + // needed. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + // Default: 0 + size_t readahead_size; + + // A threshold for the number of keys that can be skipped before failing an + // iterator seek as incomplete. The default value of 0 should be used to + // never fail a request as incomplete, even on skipping too many keys. + // Default: 0 + uint64_t max_skippable_internal_keys; + + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::Incomplete is returned. + // Default: kReadAllTier + ReadTier read_tier; + + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums; + + // Should the "data block"/"index block"" read for this iteration be placed in + // block cache? + // Callers may wish to set this field to false for bulk scans. + // This would help not to the change eviction order of existing items in the + // block cache. Default: true + bool fill_cache; + + // Specify to create a tailing iterator -- a special iterator that has a + // view of the complete database (i.e. it can also be used to read newly + // added data) and is optimized for sequential reads. It will return records + // that were inserted into the database after the creation of the iterator. + // Default: false + // Not supported in ROCKSDB_LITE mode! + bool tailing; + + // This options is not used anymore. It was to turn on a functionality that + // has been removed. + bool managed; + + // Enable a total order seek regardless of index format (e.g. hash index) + // used in the table. Some table format (e.g. plain table) may not support + // this option. + // If true when calling Get(), we also skip prefix bloom when reading from + // block based table. It provides a way to read existing data after + // changing implementation of prefix extractor. + bool total_order_seek; + + // When true, by default use total_order_seek = true, and RocksDB can + // selectively enable prefix seek mode if won't generate a different result + // from total_order_seek, based on seek key, and iterator upper bound. + // Not suppported in ROCKSDB_LITE mode, in the way that even with value true + // prefix mode is not used. + bool auto_prefix_mode; + + // Enforce that the iterator only iterates over the same prefix as the seek. + // This option is effective only for prefix seeks, i.e. prefix_extractor is + // non-null for the column family and total_order_seek is false. Unlike + // iterate_upper_bound, prefix_same_as_start only works within a prefix + // but in both directions. + // Default: false + bool prefix_same_as_start; + + // Keep the blocks loaded by the iterator pinned in memory as long as the + // iterator is not deleted, If used when reading from tables created with + // BlockBasedTableOptions::use_delta_encoding = false, + // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to + // return 1. + // Default: false + bool pin_data; + + // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we + // schedule a background job in the flush job queue and delete obsolete files + // in background. + // Default: false + bool background_purge_on_iterator_cleanup; + + // If true, keys deleted using the DeleteRange() API will be visible to + // readers until they are naturally deleted during compaction. This improves + // read performance in DBs with many range deletions. + // Default: false + bool ignore_range_deletions; + + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. This option only affects Iterators and has + // no impact on point lookups. + // Default: empty (every table will be scanned) + std::function<bool(const TableProperties&)> table_filter; + + // Needed to support differential snapshots. Has 2 effects: + // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum + // 2) if this param > 0 iterator will return INTERNAL keys instead of + // user keys; e.g. return tombstones as well. + // Default: 0 (don't filter by seqnum, return user keys) + SequenceNumber iter_start_seqnum; + + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order <key, timestamp> tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + + ReadOptions(); + ReadOptions(bool cksum, bool cache); +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. + // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fdatasync()". + // + // Default: false + bool sync; + + // If true, writes will not first go to the write ahead log, + // and the write may get lost after a crash. The backup engine + // relies on write-ahead logs to back up the memtable, so if + // you disable write-ahead logs, you must create backups with + // flush_before_backup=true to avoid losing unflushed memtable data. + // Default: false + bool disableWAL; + + // If true and if user is trying to write to column families that don't exist + // (they were dropped), ignore the write (don't return an error). If there + // are multiple writes in a WriteBatch, other writes will succeed. + // Default: false + bool ignore_missing_column_families; + + // If true and we need to wait or sleep for the write request, fails + // immediately with Status::Incomplete(). + // Default: false + bool no_slowdown; + + // If true, this write request is of lower priority if compaction is + // behind. In this case, no_slowdown = true, the request will be cancelled + // immediately with Status::Incomplete() returned. Otherwise, it will be + // slowed down. The slowdown value is determined by RocksDB to guarantee + // it introduces minimum impacts to high priority writes. + // + // Default: false + bool low_pri; + + // If true, this writebatch will maintain the last insert positions of each + // memtable as hints in concurrent write. It can improve write performance + // in concurrent writes if keys in one writebatch are sequential. In + // non-concurrent writes (when concurrent_memtable_writes is false) this + // option will be ignored. + // + // Default: false + bool memtable_insert_hint_per_batch; + + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order <key, timestamp> tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + + WriteOptions() + : sync(false), + disableWAL(false), + ignore_missing_column_families(false), + no_slowdown(false), + low_pri(false), + memtable_insert_hint_per_batch(false), + timestamp(nullptr) {} +}; + +// Options that control flush operations +struct FlushOptions { + // If true, the flush will wait until the flush is done. + // Default: true + bool wait; + // If true, the flush would proceed immediately even it means writes will + // stall for the duration of the flush; if false the operation will wait + // until it's possible to do flush w/o causing stall or until required flush + // is performed by someone else (foreground call or background thread). + // Default: false + bool allow_write_stall; + FlushOptions() : wait(true), allow_write_stall(false) {} +}; + +// Create a Logger from provided DBOptions +extern Status CreateLoggerFromOptions(const std::string& dbname, + const DBOptions& options, + std::shared_ptr<Logger>* logger); + +// CompactionOptions are used in CompactFiles() call. +struct CompactionOptions { + // Compaction output compression type + // Default: snappy + // If set to `kDisableCompressionOption`, RocksDB will choose compression type + // according to the `ColumnFamilyOptions`, taking into account the output + // level if `compression_per_level` is specified. + CompressionType compression; + // Compaction will create files of size `output_file_size_limit`. + // Default: MAX, which means that compaction will create a single file + uint64_t output_file_size_limit; + // If > 0, it will replace the option in the DBOptions for this compaction. + uint32_t max_subcompactions; + + CompactionOptions() + : compression(kSnappyCompression), + output_file_size_limit(std::numeric_limits<uint64_t>::max()), + max_subcompactions(0) {} +}; + +// For level based compaction, we can configure if we want to skip/force +// bottommost level compaction. +enum class BottommostLevelCompaction { + // Skip bottommost level compaction + kSkip, + // Only compact bottommost level if there is a compaction filter + // This is the default option + kIfHaveCompactionFilter, + // Always compact bottommost level + kForce, + // Always compact bottommost level but in bottommost level avoid + // double-compacting files created in the same compaction + kForceOptimized, +}; + +// CompactRangeOptions is used by CompactRange() call. +struct CompactRangeOptions { + // If true, no other compaction will run at the same time as this + // manual compaction + bool exclusive_manual_compaction = true; + // If true, compacted files will be moved to the minimum level capable + // of holding the data or given level (specified non-negative target_level). + bool change_level = false; + // If change_level is true and target_level have non-negative value, compacted + // files will be moved to target_level. + int target_level = -1; + // Compaction outputs will be placed in options.db_paths[target_path_id]. + // Behavior is undefined if target_path_id is out of range. + uint32_t target_path_id = 0; + // By default level based compaction will only compact the bottommost level + // if there is a compaction filter + BottommostLevelCompaction bottommost_level_compaction = + BottommostLevelCompaction::kIfHaveCompactionFilter; + // If true, will execute immediately even if doing so would cause the DB to + // enter write stall mode. Otherwise, it'll sleep until load is low enough. + bool allow_write_stall = false; + // If > 0, it will replace the option in the DBOptions for this compaction. + uint32_t max_subcompactions = 0; +}; + +// IngestExternalFileOptions is used by IngestExternalFile() +struct IngestExternalFileOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; + // If set to false, an ingested file keys could appear in existing snapshots + // that where created before the file was ingested. + bool snapshot_consistency = true; + // If set to false, IngestExternalFile() will fail if the file key range + // overlaps with existing keys or tombstones in the DB. + bool allow_global_seqno = true; + // If set to false and the file key range overlaps with the memtable key range + // (memtable flush required), IngestExternalFile will fail. + bool allow_blocking_flush = true; + // Set to true if you would like duplicate keys in the file being ingested + // to be skipped rather than overwriting existing data under that key. + // Usecase: back-fill of some historical data in the database without + // over-writing existing newer version of data. + // This option could only be used if the DB has been running + // with allow_ingest_behind=true since the dawn of time. + // All files will be ingested at the bottommost level with seqno=0. + bool ingest_behind = false; + // Set to true if you would like to write global_seqno to a given offset in + // the external SST file for backward compatibility. Older versions of + // RocksDB writes a global_seqno to a given offset within ingested SST files, + // and new versions of RocksDB do not. If you ingest an external SST using + // new version of RocksDB and would like to be able to downgrade to an + // older version of RocksDB, you should set 'write_global_seqno' to true. If + // your service is just starting to use the new RocksDB, we recommend that + // you set this option to false, which brings two benefits: + // 1. No extra random write for global_seqno during ingestion. + // 2. Without writing external SST file, it's possible to do checksum. + // We have a plan to set this option to false by default in the future. + bool write_global_seqno = true; + // Set to true if you would like to verify the checksums of each block of the + // external SST file before ingestion. + // Warning: setting this to true causes slowdown in file ingestion because + // the external SST file has to be read. + bool verify_checksums_before_ingest = false; + // When verify_checksums_before_ingest = true, RocksDB uses default + // readahead setting to scan the file while verifying checksums before + // ingestion. + // Users can override the default value using this option. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + size_t verify_checksums_readahead_size = 0; +}; + +enum TraceFilterType : uint64_t { + // Trace all the operations + kTraceFilterNone = 0x0, + // Do not trace the get operations + kTraceFilterGet = 0x1 << 0, + // Do not trace the write operations + kTraceFilterWrite = 0x1 << 1 +}; + +// TraceOptions is used for StartTrace +struct TraceOptions { + // To avoid the trace file size grows large than the storage space, + // user can set the max trace file size in Bytes. Default is 64GB + uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024; + // Specify trace sampling option, i.e. capture one per how many requests. + // Default to 1 (capture every request). + uint64_t sampling_frequency = 1; + // Note: The filtering happens before sampling. + uint64_t filter = kTraceFilterNone; +}; + +// ImportColumnFamilyOptions is used by ImportColumnFamily() +struct ImportColumnFamilyOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; +}; + +// Options used with DB::GetApproximateSizes() +struct SizeApproximationOptions { + // Defines whether the returned size should include the recently written + // data in the mem-tables. If set to false, include_files must be true. + bool include_memtabtles = false; + // Defines whether the returned size should include data serialized to disk. + // If set to false, include_memtabtles must be true. + bool include_files = true; + // When approximating the files total size that is used to store a keys range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows to take some + // shortcuts in files size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned files size + // approximation will be within 10%. + // If the value is non-positive - a more precise yet more CPU intensive + // estimation is performed. + double files_size_error_margin = -1.0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h new file mode 100644 index 000000000..123a21bc9 --- /dev/null +++ b/src/rocksdb/include/rocksdb/perf_context.h @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> +#include <map> +#include <string> + +#include "rocksdb/perf_level.h" + +namespace ROCKSDB_NAMESPACE { + +// A thread local context for gathering performance counter efficiently +// and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. + +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevel { + // # of times bloom filter has avoided file reads, i.e., negatives. + uint64_t bloom_filter_useful = 0; + // # of times bloom FullFilter has not avoided the reads. + uint64_t bloom_filter_full_positive = 0; + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + uint64_t bloom_filter_full_true_positive = 0; + + // total number of user key returned (only include keys that are found, does + // not include keys that are deleted or merged without a final put + uint64_t user_key_return_count; + + // total nanos spent on reading data from SST files + uint64_t get_from_table_nanos; + + uint64_t block_cache_hit_count = 0; // total number of block cache hits + uint64_t block_cache_miss_count = 0; // total number of block cache misses + + void Reset(); // reset all performance counters to zero +}; + +struct PerfContext { + ~PerfContext(); + + PerfContext() {} + + PerfContext(const PerfContext&); + PerfContext& operator=(const PerfContext&); + PerfContext(PerfContext&&) noexcept; + + void Reset(); // reset all performance counters to zero + + std::string ToString(bool exclude_zero_counters = false) const; + + // enable per level perf context and allocate storage for PerfContextByLevel + void EnablePerLevelPerfContext(); + + // temporarily disable per level perf contxt by setting the flag to false + void DisablePerLevelPerfContext(); + + // free the space for PerfContextByLevel, also disable per level perf context + void ClearPerLevelPerfContext(); + + uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; // total number of block cache hits + uint64_t block_read_count; // total number of block reads (with IO) + uint64_t block_read_byte; // total number of bytes from block reads + uint64_t block_read_time; // total nanos spent on block reads + uint64_t block_cache_index_hit_count; // total number of index block hits + uint64_t index_block_read_count; // total number of index block reads + uint64_t block_cache_filter_hit_count; // total number of filter block hits + uint64_t filter_block_read_count; // total number of filter block reads + uint64_t compression_dict_block_read_count; // total number of compression + // dictionary block reads + uint64_t block_checksum_time; // total nanos spent on block checksum + uint64_t block_decompress_time; // total nanos spent on block decompression + + uint64_t get_read_bytes; // bytes for vals returned by Get + uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet + uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator + + // total number of internal keys skipped over during iteration. + // There are several reasons for it: + // 1. when calling Next(), the iterator is in the position of the previous + // key, so that we'll need to skip it. It means this counter will always + // be incremented in Next(). + // 2. when calling Next(), we need to skip internal entries for the previous + // keys that are overwritten. + // 3. when calling Next(), Seek() or SeekToFirst(), after previous key + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next + // valid key that the operation should place the iterator to. We need + // to skip both of the tombstone and updates hidden by the tombstones. The + // tombstones are not included in this counter, while previous updates + // hidden by the tombstones will be included here. + // 4. symmetric cases for Prev() and SeekToLast() + // internal_recent_skipped_count is not included in this counter. + // + uint64_t internal_key_skipped_count; + // Total number of deletes and single deletes skipped over during iteration + // When calling Next(), Seek() or SeekToFirst(), after previous position + // before calling Next(), the seek key in Seek() or the beginning for + // SeekToFirst(), there may be one or more deleted keys before the next valid + // key. Every deleted key is counted once. We don't recount here if there are + // still older updates invalidated by the tombstones. + // + uint64_t internal_delete_skipped_count; + // How many times iterators skipped over internal keys that are more recent + // than the snapshot that iterator is using. + // + uint64_t internal_recent_skipped_count; + // How many values were fed into merge operator by iterators. + // + uint64_t internal_merge_count; + + uint64_t get_snapshot_time; // total nanos spent on getting snapshot + uint64_t get_from_memtable_time; // total nanos spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total nanos spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total nanos reading from output files + // total nanos spent on seeking memtable + uint64_t seek_on_memtable_time; + // number of seeks issued on memtable + // (including SeekForPrev but not SeekToFirst and SeekToLast) + uint64_t seek_on_memtable_count; + // number of Next()s issued on memtable + uint64_t next_on_memtable_count; + // number of Prev()s issued on memtable + uint64_t prev_on_memtable_count; + // total nanos spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total nanos spent on the merge min heap + uint64_t seek_max_heap_time; // total nanos spent on the merge max heap + // total nanos spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total nanos spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + + // This group of stats provide a breakdown of time spent by Write(). + // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write + // are enabled. + // + // total nanos spent on writing to WAL + uint64_t write_wal_time; + // total nanos spent on writing to mem tables + uint64_t write_memtable_time; + // total nanos spent on delaying or throttling write + uint64_t write_delay_time; + // total nanos spent on switching memtable/wal and scheduling + // flushes/compactions. + uint64_t write_scheduling_flushes_compactions_time; + // total nanos spent on writing a record, excluding the above four things + uint64_t write_pre_and_post_process_time; + + // time spent waiting for other threads of the batch group + uint64_t write_thread_wait_nanos; + + // time spent on acquiring DB mutex. + uint64_t db_mutex_lock_nanos; + // Time spent on waiting with a condition variable created with DB mutex. + uint64_t db_condition_wait_nanos; + // Time spent on merge operator. + uint64_t merge_operator_time_nanos; + + // Time spent on reading index block from block cache or SST file + uint64_t read_index_block_nanos; + // Time spent on reading filter block from block cache or SST file + uint64_t read_filter_block_nanos; + // Time spent on creating data block iterator + uint64_t new_table_block_iter_nanos; + // Time spent on creating a iterator of an SST file. + uint64_t new_table_iterator_nanos; + // Time spent on seeking a key in data/index blocks + uint64_t block_seek_nanos; + // Time spent on finding or creating a table reader + uint64_t find_table_nanos; + // total number of mem table bloom hits + uint64_t bloom_memtable_hit_count; + // total number of mem table bloom misses + uint64_t bloom_memtable_miss_count; + // total number of SST table bloom hits + uint64_t bloom_sst_hit_count; + // total number of SST table bloom misses + uint64_t bloom_sst_miss_count; + + // Time spent waiting on key locks in transaction lock manager. + uint64_t key_lock_wait_time; + // number of times acquiring a lock was blocked by another transaction. + uint64_t key_lock_wait_count; + + // Total time spent in Env filesystem operations. These are only populated + // when TimedEnv is used. + uint64_t env_new_sequential_file_nanos; + uint64_t env_new_random_access_file_nanos; + uint64_t env_new_writable_file_nanos; + uint64_t env_reuse_writable_file_nanos; + uint64_t env_new_random_rw_file_nanos; + uint64_t env_new_directory_nanos; + uint64_t env_file_exists_nanos; + uint64_t env_get_children_nanos; + uint64_t env_get_children_file_attributes_nanos; + uint64_t env_delete_file_nanos; + uint64_t env_create_dir_nanos; + uint64_t env_create_dir_if_missing_nanos; + uint64_t env_delete_dir_nanos; + uint64_t env_get_file_size_nanos; + uint64_t env_get_file_modification_time_nanos; + uint64_t env_rename_file_nanos; + uint64_t env_link_file_nanos; + uint64_t env_lock_file_nanos; + uint64_t env_unlock_file_nanos; + uint64_t env_new_logger_nanos; + + uint64_t get_cpu_nanos; + uint64_t iter_next_cpu_nanos; + uint64_t iter_prev_cpu_nanos; + uint64_t iter_seek_cpu_nanos; + + std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr; + bool per_level_perf_context_enabled = false; +}; + +// Get Thread-local PerfContext object pointer +// if defined(NPERF_CONTEXT), then the pointer is not thread-local +PerfContext* get_perf_context(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h new file mode 100644 index 000000000..e6a768904 --- /dev/null +++ b/src/rocksdb/include/rocksdb/perf_level.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// How much perf stats to collect. Affects perf_context and iostats_context. +enum PerfLevel : unsigned char { + kUninitialized = 0, // unknown setting + kDisable = 1, // disable perf stats + kEnableCount = 2, // enable only count stats + kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time + // stats except for mutexes + // Other than time, also measure CPU time counters. Still don't measure + // time (neither wall time nor CPU time) for mutexes. + kEnableTimeAndCPUTimeExceptForMutex = 4, + kEnableTime = 5, // enable count and time stats + kOutOfBounds = 6 // N.B. Must always be the last value! +}; + +// set the perf stats level for current thread +void SetPerfLevel(PerfLevel level); + +// get current perf stats level for current thread +PerfLevel GetPerfLevel(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h new file mode 100644 index 000000000..9651812c8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/persistent_cache.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <stdint.h> +#include <memory> +#include <string> + +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// PersistentCache +// +// Persistent cache interface for caching IO pages on a persistent medium. The +// cache interface is specifically designed for persistent read cache. +class PersistentCache { + public: + typedef std::vector<std::map<std::string, double>> StatsType; + + virtual ~PersistentCache() {} + + // Insert to page cache + // + // page_key Identifier to identify a page uniquely across restarts + // data Page data + // size Size of the page + virtual Status Insert(const Slice& key, const char* data, + const size_t size) = 0; + + // Lookup page cache by page identifier + // + // page_key Page identifier + // buf Buffer where the data should be copied + // size Size of the page + virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data, + size_t* size) = 0; + + // Is cache storing uncompressed data ? + // + // True if the cache is configured to store uncompressed data else false + virtual bool IsCompressed() = 0; + + // Return stats as map of {string, double} per-tier + // + // Persistent cache can be initialized as a tier of caches. The stats are per + // tire top-down + virtual StatsType Stats() = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; + +// Factor method to create a new persistent cache +Status NewPersistentCache(Env* const env, const std::string& path, + const uint64_t size, + const std::shared_ptr<Logger>& log, + const bool optimized_for_nvm, + std::shared_ptr<PersistentCache>* cache); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h new file mode 100644 index 000000000..0ee89f5c8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/rate_limiter.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" + +namespace ROCKSDB_NAMESPACE { + +class RateLimiter { + public: + enum class OpType { + // Limitation: we currently only invoke Request() with OpType::kRead for + // compactions when DBOptions::new_table_reader_for_compaction_inputs is set + kRead, + kWrite, + }; + enum class Mode { + kReadsOnly, + kWritesOnly, + kAllIo, + }; + + // For API compatibility, default to rate-limiting writes only. + explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {} + + virtual ~RateLimiter() {} + + // This API allows user to dynamically change rate limiter's bytes per second. + // REQUIRED: bytes_per_second > 0 + virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; + + // Deprecated. New RateLimiter derived classes should override + // Request(const int64_t, const Env::IOPriority, Statistics*) or + // Request(const int64_t, const Env::IOPriority, Statistics*, OpType) + // instead. + // + // Request for token for bytes. If this request can not be satisfied, the call + // is blocked. Caller is responsible to make sure + // bytes <= GetSingleBurstBytes() + virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) { + assert(false); + } + + // Request for token for bytes and potentially update statistics. If this + // request can not be satisfied, the call is blocked. Caller is responsible to + // make sure bytes <= GetSingleBurstBytes(). + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* /* stats */) { + // For API compatibility, default implementation calls the older API in + // which statistics are unsupported. + Request(bytes, pri); + } + + // Requests token to read or write bytes and potentially updates statistics. + // + // If this request can not be satisfied, the call is blocked. Caller is + // responsible to make sure bytes <= GetSingleBurstBytes(). + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats, OpType op_type) { + if (IsRateLimited(op_type)) { + Request(bytes, pri, stats); + } + } + + // Requests token to read or write bytes and potentially updates statistics. + // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of + // direct I/O) to allocate an appropriate number of bytes, which may be less + // than the number of bytes requested. + virtual size_t RequestToken(size_t bytes, size_t alignment, + Env::IOPriority io_priority, Statistics* stats, + RateLimiter::OpType op_type); + + // Max bytes can be granted in a single burst + virtual int64_t GetSingleBurstBytes() const = 0; + + // Total bytes that go through rate limiter + virtual int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + // Total # of requests that go through rate limiter + virtual int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + + virtual int64_t GetBytesPerSecond() const = 0; + + virtual bool IsRateLimited(OpType op_type) { + if ((mode_ == RateLimiter::Mode::kWritesOnly && + op_type == RateLimiter::OpType::kRead) || + (mode_ == RateLimiter::Mode::kReadsOnly && + op_type == RateLimiter::OpType::kWrite)) { + return false; + } + return true; + } + + protected: + Mode GetMode() { return mode_; } + + private: + const Mode mode_; +}; + +// Create a RateLimiter object, which can be shared among RocksDB instances to +// control write rate of flush and compaction. +// @rate_bytes_per_sec: this is the only parameter you want to set most of the +// time. It controls the total write rate of compaction and flush in bytes per +// second. Currently, RocksDB does not enforce rate limit for anything other +// than flush and compaction, e.g. write to WAL. +// @refill_period_us: this controls how often tokens are refilled. For example, +// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to +// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to +// burstier writes while smaller value introduces more CPU overhead. +// The default should work for most cases. +// @fairness: RateLimiter accepts high-pri requests and low-pri requests. +// A low-pri request is usually blocked in favor of hi-pri request. Currently, +// RocksDB assigns low-pri to request from compaction and high-pri to request +// from flush. Low-pri requests can get blocked if flush requests come in +// continuously. This fairness parameter grants low-pri requests permission by +// 1/fairness chance even though high-pri requests exist to avoid starvation. +// You should be good by leaving it at default 10. +// @mode: Mode indicates which types of operations count against the limit. +// @auto_tuned: Enables dynamic adjustment of rate limit within the range +// `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to +// the recent demand for background I/O. +extern RateLimiter* NewGenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000, + int32_t fairness = 10, + RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly, + bool auto_tuned = false); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h new file mode 100644 index 000000000..e9f8620d0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h @@ -0,0 +1,10 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_NAMESPACE +#define ROCKSDB_NAMESPACE rocksdb +#endif diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h new file mode 100644 index 000000000..c17b32c5c --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice.h @@ -0,0 +1,269 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#pragma once + +#include <assert.h> +#include <stddef.h> +#include <string.h> +#include <cstdio> +#include <string> + +#ifdef __cpp_lib_string_view +#include <string_view> +#endif + +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) {} + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : data_(d), size_(n) {} + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) {} + +#ifdef __cpp_lib_string_view + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {} +#endif + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); } + + // Create a single slice from SliceParts using buf as storage. + // buf must exist as long as the returned Slice exists. + Slice(const struct SliceParts& parts, std::string* buf); + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { + data_ = ""; + size_ = 0; + } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + // when hex is true, returns a string of twice the length hex encoded (0-9A-F) + std::string ToString(bool hex = false) const; + +#ifdef __cpp_lib_string_view + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } +#endif + + // Decodes the current slice interpreted as an hexadecimal string into result, + // if successful returns true, if this isn't a valid hex string + // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. + // This slice is expected to have an even number of 0-9A-F characters + // also accepts lowercase (a-f) + bool DecodeHex(std::string* result) const; + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0)); + } + + bool ends_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); + } + + // Compare two slices and returns the first byte where they differ + size_t difference_offset(const Slice& b) const; + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +/** + * A Slice that can be pinned with some cleanup tasks, which will be run upon + * ::Reset() or object destruction, whichever is invoked first. This can be used + * to avoid memcpy by having the PinnableSlice object referring to the data + * that is locked in the memory and release them after the data is consumed. + */ +class PinnableSlice : public Slice, public Cleanable { + public: + PinnableSlice() { buf_ = &self_space_; } + explicit PinnableSlice(std::string* buf) { buf_ = buf; } + + PinnableSlice(PinnableSlice&& other); + PinnableSlice& operator=(PinnableSlice&& other); + + // No copy constructor and copy assignment allowed. + PinnableSlice(PinnableSlice&) = delete; + PinnableSlice& operator=(PinnableSlice&) = delete; + + inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, + void* arg2) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + RegisterCleanup(f, arg1, arg2); + assert(pinned_); + } + + inline void PinSlice(const Slice& s, Cleanable* cleanable) { + assert(!pinned_); + pinned_ = true; + data_ = s.data(); + size_ = s.size(); + cleanable->DelegateCleanupsTo(this); + assert(pinned_); + } + + inline void PinSelf(const Slice& slice) { + assert(!pinned_); + buf_->assign(slice.data(), slice.size()); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + inline void PinSelf() { + assert(!pinned_); + data_ = buf_->data(); + size_ = buf_->size(); + assert(!pinned_); + } + + void remove_suffix(size_t n) { + assert(n <= size()); + if (pinned_) { + size_ -= n; + } else { + buf_->erase(size() - n, n); + PinSelf(); + } + } + + void remove_prefix(size_t n) { + assert(n <= size()); + if (pinned_) { + data_ += n; + size_ -= n; + } else { + buf_->erase(0, n); + PinSelf(); + } + } + + void Reset() { + Cleanable::Reset(); + pinned_ = false; + size_ = 0; + } + + inline std::string* GetSelf() { return buf_; } + + inline bool IsPinned() const { return pinned_; } + + private: + friend class PinnableSlice4Test; + std::string self_space_; + std::string* buf_; + bool pinned_ = false; +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* _parts, int _num_parts) + : parts(_parts), num_parts(_num_parts) {} + SliceParts() : parts(nullptr), num_parts(0) {} + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); } + +inline int Slice::compare(const Slice& b) const { + assert(data_ != nullptr && b.data_ != nullptr); + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) + r = -1; + else if (size_ > b.size_) + r = +1; + } + return r; +} + +inline size_t Slice::difference_offset(const Slice& b) const { + size_t off = 0; + const size_t len = (size_ < b.size_) ? size_ : b.size_; + for (; off < len; off++) { + if (data_[off] != b.data_[off]) break; + } + return off; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h new file mode 100644 index 000000000..54f61f9d2 --- /dev/null +++ b/src/rocksdb/include/rocksdb/slice_transform.h @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. + +#pragma once + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +/* + * A SliceTransform is a generic pluggable way of transforming one string + * to another. Its primary use-case is in configuring rocksdb + * to store prefix blooms by setting prefix_extractor in + * ColumnFamilyOptions. + */ +class SliceTransform { + public: + virtual ~SliceTransform(){}; + + // Return the name of this transformation. + virtual const char* Name() const = 0; + + // Extract a prefix from a specified key. This method is called when + // a key is inserted into the db, and the returned slice is used to + // create a bloom filter. + virtual Slice Transform(const Slice& key) const = 0; + + // Determine whether the specified key is compatible with the logic + // specified in the Transform method. This method is invoked for every + // key that is inserted into the db. If this method returns true, + // then Transform is called to translate the key to its prefix and + // that returned prefix is inserted into the bloom filter. If this + // method returns false, then the call to Transform is skipped and + // no prefix is inserted into the bloom filters. + // + // For example, if the Transform method operates on a fixed length + // prefix of size 4, then an invocation to InDomain("abc") returns + // false because the specified key length(3) is shorter than the + // prefix size of 4. + // + // Wiki documentation here: + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes + // + virtual bool InDomain(const Slice& key) const = 0; + + // This is currently not used and remains here for backward compatibility. + virtual bool InRange(const Slice& /*dst*/) const { return false; } + + // Some SliceTransform will have a full length which can be used to + // determine if two keys are consecuitive. Can be disabled by always + // returning 0 + virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } + + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. + // + // This function is not used by RocksDB, but for users. If users pass + // Options by string to RocksDB, they might not know what prefix extractor + // they are using. This function is to help users can determine: + // if they want to iterate all keys prefixing `prefix`, whether it is + // safe to use prefix bloom filter and seek to key `prefix`. + // If this function returns true, this means a user can Seek() to a prefix + // using the bloom filter. Otherwise, user needs to skip the bloom filter + // by setting ReadOptions.total_order_seek = true. + // + // Here is an example: Suppose we implement a slice transform that returns + // the first part of the string after splitting it using delimiter ",": + // 1. SameResultWhenAppended("abc,") should return true. If applying prefix + // bloom filter using it, all slices matching "abc:.*" will be extracted + // to "abc,", so any SST file or memtable containing any of those key + // will not be filtered out. + // 2. SameResultWhenAppended("abc") should return false. A user will not + // guaranteed to see all the keys matching "abc.*" if a user seek to "abc" + // against a DB with the same setting. If one SST file only contains + // "abcd,e", the file can be filtered out and the key will be invisible. + // + // i.e., an implementation always returning false is safe. + virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const { + return false; + } +}; + +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); + +extern const SliceTransform* NewNoopTransform(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h new file mode 100644 index 000000000..6a7212d60 --- /dev/null +++ b/src/rocksdb/include/rocksdb/snapshot.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. +// +// To Create a Snapshot, call DB::GetSnapshot(). +// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot). +class Snapshot { + public: + // returns Snapshot's sequence number + virtual SequenceNumber GetSequenceNumber() const = 0; + + protected: + virtual ~Snapshot(); +}; + +// Simple RAII wrapper class for Snapshot. +// Constructing this object will create a snapshot. Destructing will +// release the snapshot. +class ManagedSnapshot { + public: + explicit ManagedSnapshot(DB* db); + + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + + ~ManagedSnapshot(); + + const Snapshot* snapshot(); + + private: + DB* db_; + const Snapshot* snapshot_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h new file mode 100644 index 000000000..ecb692e31 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE +#pragma once + +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +class SSTDumpTool { + public: + int Run(int argc, char** argv, Options options = Options()); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h new file mode 100644 index 000000000..92d0bbbf8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_manager.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "rocksdb/file_system.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Env; +class Logger; + +// SstFileManager is used to track SST files in the DB and control their +// deletion rate. +// All SstFileManager public functions are thread-safe. +// SstFileManager is not extensible. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST files exceeds max_allowed_space, writes to + // RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature; maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + + // Set the amount of buffer room each compaction should be able to leave. + // In other words, at its maximum disk space consumption, the compaction + // should still leave compaction_buffer_size available on the disk so that + // other background functions may continue, such as logging and flushing. + virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; + + // Return true if the total size of SST files exceeded the maximum allowed + // space usage. + // + // thread-safe. + virtual bool IsMaxAllowedSpaceReached() = 0; + + // Returns true if the total size of SST files as well as estimated size + // of ongoing compactions exceeds the maximums allowed space usage. + virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; + + // Return the total size of all tracked files. + // thread-safe + virtual uint64_t GetTotalSize() = 0; + + // Return a map containing all tracked files and their corresponding sizes. + // thread-safe + virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0; + + // Return delete rate limit in bytes per second. + // thread-safe + virtual int64_t GetDeleteRateBytesPerSecond() = 0; + + // Update the delete rate limit in bytes per second. + // zero means disable delete rate limiting and delete files immediately + // thread-safe + virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0; + + // Return trash/DB size ratio where new files will be deleted immediately + // thread-safe + virtual double GetMaxTrashDBRatio() = 0; + + // Update trash/DB size ratio where new files will be deleted immediately + // thread-safe + virtual void SetMaxTrashDBRatio(double ratio) = 0; + + // Return the total size of trash files + // thread-safe + virtual uint64_t GetTotalTrashSize() = 0; +}; + +// Create a new SstFileManager that can be shared among multiple RocksDB +// instances to track SST file and control there deletion rate. +// Even though SstFileManager don't track WAL files but it still control +// there deletion rate. +// +// @param env: Pointer to Env object, please see "rocksdb/env.h". +// @param fs: Pointer to FileSystem object (rocksdb/file_system.h" +// @param info_log: If not nullptr, info_log will be used to log errors. +// +// == Deletion rate limiting specific arguments == +// @param trash_dir: Deprecated, this argument have no effect +// @param rate_bytes_per_sec: How many bytes should be deleted per second, If +// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb +// in 1 second, we will wait for another 3 seconds before we delete other +// files, Set to 0 to disable deletion rate limiting. +// This option also affect the delete rate of WAL files in the DB. +// @param delete_existing_trash: Deprecated, this argument have no effect, but +// if user provide trash_dir we will schedule deletes for files in the dir +// @param status: If not nullptr, status will contain any errors that happened +// during creating the missing trash_dir or deleting existing files in trash. +// @param max_trash_db_ratio: If the trash size constitutes for more than this +// fraction of the total DB size we will start deleting new files passed to +// DeleteScheduler immediately +// @param bytes_max_delete_chunk: if a file to delete is larger than delete +// chunk, ftruncate the file by this size each time, rather than dropping the +// whole file. 0 means to always delete the whole file. If the file has more +// than one linked names, the file will be deleted as a whole. Either way, +// `rate_bytes_per_sec` will be appreciated. NOTE that with this option, +// files already renamed as a trash may be partial, so users should not +// directly recover them without checking. +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr<FileSystem> fs, + std::shared_ptr<Logger> info_log = nullptr, + const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +// Same as above, but takes a pointer to a legacy Env object, instead of +// Env and FileSystem objects +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr<Logger> info_log = nullptr, + std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h new file mode 100644 index 000000000..4b8642480 --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_reader.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// SstFileReader is used to read sst files that are generated by DB or +// SstFileWriter. +class SstFileReader { + public: + SstFileReader(const Options& options); + + ~SstFileReader(); + + // Prepares to read from the file located at "file_path". + Status Open(const std::string& file_path); + + // Returns a new iterator over the table contents. + // Most read options provide the same control as we read from DB. + // If "snapshot" is nullptr, the iterator returns only the latest keys. + Iterator* NewIterator(const ReadOptions& options); + + std::shared_ptr<const TableProperties> GetTableProperties() const; + + // Verifies whether there is corruption in this table. + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } + + private: + struct Rep; + std::unique_ptr<Rep> rep_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h new file mode 100644 index 000000000..e83383fea --- /dev/null +++ b/src/rocksdb/include/rocksdb/sst_file_writer.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <memory> +#include <string> + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/types.h" + +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) +#elif _WIN32 +#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) +#endif + +namespace ROCKSDB_NAMESPACE { + +class Comparator; + +// ExternalSstFileInfo include information about sst files created +// using SstFileWriter. +struct ExternalSstFileInfo { + ExternalSstFileInfo() + : file_path(""), + smallest_key(""), + largest_key(""), + smallest_range_del_key(""), + largest_range_del_key(""), + sequence_number(0), + file_size(0), + num_entries(0), + num_range_del_entries(0), + version(0) {} + + ExternalSstFileInfo(const std::string& _file_path, + const std::string& _smallest_key, + const std::string& _largest_key, + SequenceNumber _sequence_number, uint64_t _file_size, + int32_t _num_entries, int32_t _version) + : file_path(_file_path), + smallest_key(_smallest_key), + largest_key(_largest_key), + smallest_range_del_key(""), + largest_range_del_key(""), + sequence_number(_sequence_number), + file_size(_file_size), + num_entries(_num_entries), + num_range_del_entries(0), + version(_version) {} + + std::string file_path; // external sst file path + std::string smallest_key; // smallest user key in file + std::string largest_key; // largest user key in file + std::string + smallest_range_del_key; // smallest range deletion user key in file + std::string largest_range_del_key; // largest range deletion user key in file + SequenceNumber sequence_number; // sequence number of all keys in file + uint64_t file_size; // file size in bytes + uint64_t num_entries; // number of entries in file + uint64_t num_range_del_entries; // number of range deletion entries in file + int32_t version; // file version +}; + +// SstFileWriter is used to create sst files that can be added to database later +// All keys in files generated by SstFileWriter will have sequence number = 0. +class SstFileWriter { + public: + // User can pass `column_family` to specify that the generated file will + // be ingested into this column_family, note that passing nullptr means that + // the column_family is unknown. + // If invalidate_page_cache is set to true, SstFileWriter will give the OS a + // hint that this file pages is not needed every time we write 1MB to the + // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be + // passed. + SstFileWriter(const EnvOptions& env_options, const Options& options, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false) + : SstFileWriter(env_options, options, options.comparator, column_family, + invalidate_page_cache, io_priority, skip_filters) {} + + // Deprecated API + SstFileWriter(const EnvOptions& env_options, const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family = nullptr, + bool invalidate_page_cache = true, + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false); + + ~SstFileWriter(); + + // Prepare SstFileWriter to write into file located at "file_path". + Status Open(const std::string& file_path); + + // Add a Put key with value to currently opened file (deprecated) + // REQUIRES: key is after any previously added key according to comparator. + ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); + + // Add a Put key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + Status Put(const Slice& user_key, const Slice& value); + + // Add a Merge key with value to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + Status Merge(const Slice& user_key, const Slice& value); + + // Add a deletion key to currently opened file + // REQUIRES: key is after any previously added key according to comparator. + Status Delete(const Slice& user_key); + + // Add a range deletion tombstone to currently opened file + Status DeleteRange(const Slice& begin_key, const Slice& end_key); + + // Finalize writing to sst file and close file. + // + // An optional ExternalSstFileInfo pointer can be passed to the function + // which will be populated with information about the created sst file. + Status Finish(ExternalSstFileInfo* file_info = nullptr); + + // Return the current file size. + uint64_t FileSize(); + + private: + void InvalidatePageCache(bool closing); + struct Rep; + std::unique_ptr<Rep> rep_; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h new file mode 100644 index 000000000..3bda6d718 --- /dev/null +++ b/src/rocksdb/include/rocksdb/statistics.h @@ -0,0 +1,548 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <atomic> +#include <cstddef> +#include <cstdint> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +/** + * Keep adding ticker's here. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. + * 3. Add a corresponding enum value to TickerType.java in the java API + * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType + * and toCppTickers + */ +enum Tickers : uint32_t { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS = 0, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of index blocks added to block cache. + BLOCK_CACHE_INDEX_ADD, + // # of bytes of index blocks inserted into cache + BLOCK_CACHE_INDEX_BYTES_INSERT, + // # of bytes of index block erased from cache + BLOCK_CACHE_INDEX_BYTES_EVICT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of filter blocks added to block cache. + BLOCK_CACHE_FILTER_ADD, + // # of bytes of bloom filter blocks inserted into cache + BLOCK_CACHE_FILTER_BYTES_INSERT, + // # of bytes of bloom filter block erased from cache + BLOCK_CACHE_FILTER_BYTES_EVICT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of data blocks added to block cache. + BLOCK_CACHE_DATA_ADD, + // # of bytes of data blocks inserted into cache + BLOCK_CACHE_DATA_BYTES_INSERT, + // # of bytes read from cache. + BLOCK_CACHE_BYTES_READ, + // # of bytes written into cache. + BLOCK_CACHE_BYTES_WRITE, + + // # of times bloom filter has avoided file reads, i.e., negatives. + BLOOM_FILTER_USEFUL, + // # of times bloom FullFilter has not avoided the reads. + BLOOM_FILTER_FULL_POSITIVE, + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + BLOOM_FILTER_FULL_TRUE_POSITIVE, + + BLOOM_FILTER_MICROS, + + // # persistent cache hit + PERSISTENT_CACHE_HIT, + // # persistent cache miss + PERSISTENT_CACHE_MISS, + + // # total simulation block cache hits + SIM_BLOCK_CACHE_HIT, + // # total simulation block cache misses + SIM_BLOCK_CACHE_MISS, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. + MEMTABLE_MISS, + + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 4 reasons currently. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + // Also includes keys dropped for range del. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. + // Deletions obsoleted before bottom level due to file gap optimization. + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + // If a compaction was cancelled in sfm to prevent ENOSPC + COMPACTION_CANCELLED, + + // Number of keys written to the database via the Put and Write call's + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // The number of uncompressed bytes issued by DB::Put(), DB::Delete(), + // DB::Merge(), and DB::Write(). + BYTES_WRITTEN, + // The number of uncompressed bytes read from DB::Get(). It could be + // either from memtables, cache, or table files. + // For the number of logical bytes read from DB::MultiGet(), + // please use NUMBER_MULTIGET_BYTES_READ. + BYTES_READ, + // The number of calls to seek/next/prev + NUMBER_DB_SEEK, + NUMBER_DB_NEXT, + NUMBER_DB_PREV, + // The number of calls to seek/next/prev that returned data + NUMBER_DB_SEEK_FOUND, + NUMBER_DB_NEXT_FOUND, + NUMBER_DB_PREV_FOUND, + // The number of uncompressed bytes read from an iterator. + // Includes size of key and value. + ITER_BYTES_READ, + NO_FILE_CLOSES, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // DEPRECATED Time system had to wait to do LO-L1 compactions + STALL_L0_SLOWDOWN_MICROS, + // DEPRECATED Time system had to wait to move memtable to L1. + STALL_MEMTABLE_COMPACTION_MICROS, + // DEPRECATED write throttle because of too many files in L0 + STALL_L0_NUM_FILES_MICROS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, + // The wait time for db mutex. + // Disabled by default. To enable it set stats level to kAll + DB_MUTEX_WAIT_MICROS, + RATE_LIMIT_DELAY_MILLIS, + // DEPRECATED number of iterators currently open + NO_ITERATORS, + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + // Number of deletes records that were not required to be + // written to storage because key does not exist + NUMBER_FILTERED_DELETES, + NUMBER_MERGE_FAILURES, + + // number of times bloom was checked before creating iterator on a + // file, and the number of times the check was useful in avoiding + // iterator creation (and thus likely IOPs). + BLOOM_FILTER_PREFIX_CHECKED, + BLOOM_FILTER_PREFIX_USEFUL, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpadtesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + // Number of blocks added to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD, + // Number of failures when adding blocks to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. + WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, // Equivalent to writes done for others + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + FLUSH_WRITE_BYTES, // Bytes written during flush + + // Number of table's properties loaded directly from file, without creating + // table reader object. + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + NUMBER_SUPERVERSION_ACQUIRES, + NUMBER_SUPERVERSION_RELEASES, + NUMBER_SUPERVERSION_CLEANUPS, + + // # of compressions/decompressions executed + NUMBER_BLOCK_COMPRESSED, + NUMBER_BLOCK_DECOMPRESSED, + + NUMBER_BLOCK_NOT_COMPRESSED, + MERGE_OPERATION_TOTAL_TIME, + FILTER_OPERATION_TOTAL_TIME, + + // Row cache. + ROW_CACHE_HIT, + ROW_CACHE_MISS, + + // Read amplification statistics. + // Read amplification can be calculated using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled + READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used. + READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks. + + // Number of refill intervals where rate limiter's bytes are fully consumed. + NUMBER_RATE_LIMITER_DRAINS, + + // Number of internal keys skipped by Iterator + NUMBER_ITER_SKIP, + + // BlobDB specific stats + // # of Put/PutTTL/PutUntil to BlobDB. + BLOB_DB_NUM_PUT, + // # of Write to BlobDB. + BLOB_DB_NUM_WRITE, + // # of Get to BlobDB. + BLOB_DB_NUM_GET, + // # of MultiGet to BlobDB. + BLOB_DB_NUM_MULTIGET, + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. + BLOB_DB_NUM_SEEK, + // # of Next to BlobDB iterator. + BLOB_DB_NUM_NEXT, + // # of Prev to BlobDB iterator. + BLOB_DB_NUM_PREV, + // # of keys written to BlobDB. + BLOB_DB_NUM_KEYS_WRITTEN, + // # of keys read from BlobDB. + BLOB_DB_NUM_KEYS_READ, + // # of bytes (key + value) written to BlobDB. + BLOB_DB_BYTES_WRITTEN, + // # of bytes (keys + value) read from BlobDB. + BLOB_DB_BYTES_READ, + // # of keys written by BlobDB as non-TTL inlined value. + BLOB_DB_WRITE_INLINED, + // # of keys written by BlobDB as TTL inlined value. + BLOB_DB_WRITE_INLINED_TTL, + // # of keys written by BlobDB as non-TTL blob value. + BLOB_DB_WRITE_BLOB, + // # of keys written by BlobDB as TTL blob value. + BLOB_DB_WRITE_BLOB_TTL, + // # of bytes written to blob file. + BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + // # of bytes read from blob file. + BLOB_DB_BLOB_FILE_BYTES_READ, + // # of times a blob files being synced. + BLOB_DB_BLOB_FILE_SYNCED, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of expiration. + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of expiration. + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_SIZE, + // # of blob files that were obsoleted by garbage collection. + BLOB_DB_GC_NUM_FILES, + // # of blob files generated by garbage collection. + BLOB_DB_GC_NUM_NEW_FILES, + // # of BlobDB garbage collection failures. + BLOB_DB_GC_FAILURES, + // # of keys dropped by BlobDB garbage collection because they had been + // overwritten. DEPRECATED. + BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, + // # of keys dropped by BlobDB garbage collection because of expiration. + // DEPRECATED. + BLOB_DB_GC_NUM_KEYS_EXPIRED, + // # of keys relocated to new blob file by garbage collection. + BLOB_DB_GC_NUM_KEYS_RELOCATED, + // # of bytes dropped by BlobDB garbage collection because they had been + // overwritten. DEPRECATED. + BLOB_DB_GC_BYTES_OVERWRITTEN, + // # of bytes dropped by BlobDB garbage collection because of expiration. + // DEPRECATED. + BLOB_DB_GC_BYTES_EXPIRED, + // # of bytes relocated to new blob file by garbage collection. + BLOB_DB_GC_BYTES_RELOCATED, + // # of blob files evicted because of BlobDB is full. + BLOB_DB_FIFO_NUM_FILES_EVICTED, + // # of keys in the blob files evicted because of BlobDB is full. + BLOB_DB_FIFO_NUM_KEYS_EVICTED, + // # of bytes in the blob files evicted because of BlobDB is full. + BLOB_DB_FIFO_BYTES_EVICTED, + + // These counters indicate a performance issue in WritePrepared transactions. + // We should not seem them ticking them much. + // # of times prepare_mutex_ is acquired in the fast path. + TXN_PREPARE_MUTEX_OVERHEAD, + // # of times old_commit_map_mutex_ is acquired in the fast path. + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + // # of times we checked a batch for duplicate keys. + TXN_DUPLICATE_KEY_OVERHEAD, + // # of times snapshot_mutex_ is acquired in the fast path. + TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, + + // Number of keys actually found in MultiGet calls (vs number requested by + // caller) + // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller + NUMBER_MULTIGET_KEYS_FOUND, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted + + BLOCK_CACHE_COMPRESSION_DICT_MISS, + BLOCK_CACHE_COMPRESSION_DICT_HIT, + BLOCK_CACHE_COMPRESSION_DICT_ADD, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + TICKER_ENUM_MAX +}; + +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap; + +/** + * Keep adding histogram's here. + * Any histogram should have value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + * Add a corresponding enum value to HistogramType.java in the java API + */ +enum Histograms : uint32_t { + DB_GET = 0, + DB_WRITE, + COMPACTION_TIME, + COMPACTION_CPU_TIME, + SUBCOMPACTION_SETUP_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + DB_SEEK, + WRITE_STALL, + SST_READ_MICROS, + // The number of subcompactions actually scheduled during a compaction + NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + + // number of bytes compressed/decompressed + // number of bytes is when uncompressed; i.e. before/after respectively + BYTES_COMPRESSED, + BYTES_DECOMPRESSED, + COMPRESSION_TIMES_NANOS, + DECOMPRESSION_TIMES_NANOS, + // Number of merge operands passed to the merge operator in user read + // requests. + READ_NUM_MERGE_OPERANDS, + + // BlobDB specific stats + // Size of keys written to BlobDB. + BLOB_DB_KEY_SIZE, + // Size of values written to BlobDB. + BLOB_DB_VALUE_SIZE, + // BlobDB Put/PutWithTTL/PutUntil/Write latency. + BLOB_DB_WRITE_MICROS, + // BlobDB Get lagency. + BLOB_DB_GET_MICROS, + // BlobDB MultiGet latency. + BLOB_DB_MULTIGET_MICROS, + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. + BLOB_DB_SEEK_MICROS, + // BlobDB Next latency. + BLOB_DB_NEXT_MICROS, + // BlobDB Prev latency. + BLOB_DB_PREV_MICROS, + // Blob file write latency. + BLOB_DB_BLOB_FILE_WRITE_MICROS, + // Blob file read latency. + BLOB_DB_BLOB_FILE_READ_MICROS, + // Blob file sync latency. + BLOB_DB_BLOB_FILE_SYNC_MICROS, + // BlobDB garbage collection time. DEPRECATED. + BLOB_DB_GC_MICROS, + // BlobDB compression time. + BLOB_DB_COMPRESSION_MICROS, + // BlobDB decompression time. + BLOB_DB_DECOMPRESSION_MICROS, + // Time spent flushing memtable to disk + FLUSH_TIME, + SST_BATCH_SIZE, + + HISTOGRAM_ENUM_MAX, +}; + +extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap; + +struct HistogramData { + double median; + double percentile95; + double percentile99; + double average; + double standard_deviation; + // zero-initialize new members since old Statistics::histogramData() + // implementations won't write them. + double max = 0.0; + uint64_t count = 0; + uint64_t sum = 0; + double min = 0.0; +}; + +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); +enum StatsLevel : uint8_t { + // Disable timer stats, and skip histogram stats + kExceptHistogramOrTimers, + // Skip timer stats + kExceptTimers, + // Collect all stats except time inside mutex lock AND time spent on + // compression. + kExceptDetailedTimers, + // Collect all stats except the counters requiring to get time inside the + // mutex lock. + kExceptTimeForMutex, + // Collect all stats, including measuring duration of mutex operations. + // If getting time is expensive on the platform to run, it can + // reduce scalability to more threads, especially for writes. + kAll, +}; + +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); +class Statistics { + public: + virtual ~Statistics() {} + static const char* Type() { return "Statistics"; } + virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; + virtual void histogramData(uint32_t type, + HistogramData* const data) const = 0; + virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; } + virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0; + virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0; + virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0; + virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) { + if (get_stats_level() <= StatsLevel::kExceptTimers) { + return; + } + recordInHistogram(histogramType, time); + } + // The function is here only for backward compatibility reason. + // Users implementing their own Statistics class should override + // recordInHistogram() instead and leave measureTime() as it is. + virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) { + // This is not supposed to be called. + assert(false); + } + virtual void recordInHistogram(uint32_t histogramType, uint64_t time) { + // measureTime() is the old and inaccurate function name. + // To keep backward compatible. If users implement their own + // statistics, which overrides measureTime() but doesn't override + // this function. We forward to measureTime(). + measureTime(histogramType, time); + } + + // Resets all ticker and histogram stats + virtual Status Reset() { return Status::NotSupported("Not implemented"); } + + // String representation of the statistic object. + virtual std::string ToString() const { + // Do nothing by default + return std::string("ToString(): not implemented"); + } + + virtual bool getTickerMap(std::map<std::string, uint64_t>*) const { + // Do nothing by default + return false; + } + + // Override this function to disable particular histogram collection + virtual bool HistEnabledForType(uint32_t type) const { + return type < HISTOGRAM_ENUM_MAX; + } + void set_stats_level(StatsLevel sl) { + stats_level_.store(sl, std::memory_order_relaxed); + } + StatsLevel get_stats_level() const { + return stats_level_.load(std::memory_order_relaxed); + } + + private: + std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers}; +}; + +// Create a concrete DBStatistics object +std::shared_ptr<Statistics> CreateDBStatistics(); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h new file mode 100644 index 000000000..4acaad26f --- /dev/null +++ b/src/rocksdb/include/rocksdb/stats_history.h @@ -0,0 +1,69 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <map> +#include <string> + +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImpl; + +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. +// Example: +// std::unique_ptr<StatsHistoryIterator> stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map<std::string, uint64_t>& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } +class StatsHistoryIterator { + public: + StatsHistoryIterator() {} + virtual ~StatsHistoryIterator() {} + + virtual bool Valid() const = 0; + + // Moves to the next stats history record. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Return the time stamp (in seconds) when stats history is recorded. + // REQUIRES: Valid() + virtual uint64_t GetStatsTime() const = 0; + + virtual int GetFormatVersion() const { return -1; } + + // Return the current stats history as an std::map which specifies the + // mapping from stats name to stats value . The underlying storage + // for the returned map is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h new file mode 100644 index 000000000..cf62512c3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/status.h @@ -0,0 +1,386 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#pragma once + +#include <string> +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Status { + public: + // Create a success status. + Status() : code_(kOk), subcode_(kNone), sev_(kNoError), state_(nullptr) {} + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + Status& operator=(const Status& s); + Status(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; + Status& operator=(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + ; + bool operator==(const Status& rhs) const; + bool operator!=(const Status& rhs) const; + + enum Code : unsigned char { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7, + kShutdownInProgress = 8, + kTimedOut = 9, + kAborted = 10, + kBusy = 11, + kExpired = 12, + kTryAgain = 13, + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode + }; + + Code code() const { return code_; } + + enum SubCode : unsigned char { + kNone = 0, + kMutexTimeout = 1, + kLockTimeout = 2, + kLockLimit = 3, + kNoSpace = 4, + kDeadlock = 5, + kStaleFile = 6, + kMemoryLimit = 7, + kSpaceLimit = 8, + kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, + kMaxSubCode + }; + + SubCode subcode() const { return subcode_; } + + enum Severity : unsigned char { + kNoError = 0, + kSoftError = 1, + kHardError = 2, + kFatalError = 3, + kUnrecoverableError = 4, + kMaxSeverity + }; + + Status(const Status& s, Severity sev); + Severity severity() const { return sev_; } + + // Returns a C style string indicating the message of the Status + const char* getState() const { return state_; } + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status Corruption(SubCode msg = kNone) { + return Status(kCorruption, msg); + } + + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status NotSupported(SubCode msg = kNone) { + return Status(kNotSupported, msg); + } + + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status InvalidArgument(SubCode msg = kNone) { + return Status(kInvalidArgument, msg); + } + + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); } + + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status MergeInProgress(SubCode msg = kNone) { + return Status(kMergeInProgress, msg); + } + + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + static Status Incomplete(SubCode msg = kNone) { + return Status(kIncomplete, msg); + } + + static Status ShutdownInProgress(SubCode msg = kNone) { + return Status(kShutdownInProgress, msg); + } + static Status ShutdownInProgress(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kShutdownInProgress, msg, msg2); + } + static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } + + static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); } + static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kBusy, msg, msg2); + } + + static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTimedOut, msg, msg2); + } + + static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); } + static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kExpired, msg, msg2); + } + + static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); } + static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kTryAgain, msg, msg2); + } + + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + + static Status NoSpace() { return Status(kIOError, kNoSpace); } + static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kNoSpace, msg, msg2); + } + + static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); } + static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, kMemoryLimit, msg, msg2); + } + + static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); } + static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kSpaceLimit, msg, msg2); + } + + static Status PathNotFound() { return Status(kIOError, kPathNotFound); } + static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kPathNotFound, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return code() == kOk; } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { return code() == kNotSupported; } + + // Returns true iff the status indicates an InvalidArgument error. + bool IsInvalidArgument() const { return code() == kInvalidArgument; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + + // Returns true iff the status indicates an MergeInProgress. + bool IsMergeInProgress() const { return code() == kMergeInProgress; } + + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { return code() == kIncomplete; } + + // Returns true iff the status indicates Shutdown In progress + bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } + + bool IsTimedOut() const { return code() == kTimedOut; } + + bool IsAborted() const { return code() == kAborted; } + + bool IsLockLimit() const { + return code() == kAborted && subcode() == kLockLimit; + } + + // Returns true iff the status indicates that a resource is Busy and + // temporarily could not be acquired. + bool IsBusy() const { return code() == kBusy; } + + bool IsDeadlock() const { return code() == kBusy && subcode() == kDeadlock; } + + // Returns true iff the status indicated that the operation has Expired. + bool IsExpired() const { return code() == kExpired; } + + // Returns true iff the status indicates a TryAgain error. + // This usually means that the operation failed, but may succeed if + // re-attempted. + bool IsTryAgain() const { return code() == kTryAgain; } + + // Returns true iff the status indicates the proposed compaction is too large + bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + + // Returns true iff the status indicates a NoSpace error + // This is caused by an I/O error returning the specific "out of space" + // error condition. Stricto sensu, an NoSpace error is an I/O error + // with a specific subcode, enabling users to take the appropriate action + // if needed + bool IsNoSpace() const { + return (code() == kIOError) && (subcode() == kNoSpace); + } + + // Returns true iff the status indicates a memory limit error. There may be + // cases where we limit the memory used in certain operations (eg. the size + // of a write batch) in order to avoid out of memory exceptions. + bool IsMemoryLimit() const { + return (code() == kAborted) && (subcode() == kMemoryLimit); + } + + // Returns true iff the status indicates a PathNotFound error + // This is caused by an I/O error returning the specific "no such file or + // directory" error condition. A PathNotFound error is an I/O error with + // a specific subcode, enabling users to take appropriate action if necessary + bool IsPathNotFound() const { + return (code() == kIOError) && (subcode() == kPathNotFound); + } + + // Returns true iff the status indicates manual compaction paused. This + // is caused by a call to PauseManualCompaction + bool IsManualCompactionPaused() const { + return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); + } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + protected: + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // of the following form: + // state_[0..3] == length of message + // state_[4..] == message + Code code_; + SubCode subcode_; + Severity sev_; + const char* state_; + + explicit Status(Code _code, SubCode _subcode = kNone) + : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} + + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + Status(Code _code, const Slice& msg, const Slice& msg2) + : Status(_code, kNone, msg, msg2) {} + + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) + : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline Status::Status(const Status& s, Severity sev) + : code_(s.code_), subcode_(s.subcode_), sev_(sev) { + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline Status& Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (this != &s) { + code_ = s.code_; + subcode_ = s.subcode_; + sev_ = s.sev_; + delete[] state_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); + } + return *this; +} + +inline Status::Status(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif + : Status() { + *this = std::move(s); +} + +inline Status& Status::operator=(Status&& s) +#if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) + noexcept +#endif +{ + if (this != &s) { + code_ = std::move(s.code_); + s.code_ = kOk; + subcode_ = std::move(s.subcode_); + s.subcode_ = kNone; + sev_ = std::move(s.sev_); + s.sev_ = kNoError; + delete[] state_; + state_ = nullptr; + std::swap(state_, s.state_); + } + return *this; +} + +inline bool Status::operator==(const Status& rhs) const { + return (code_ == rhs.code_); +} + +inline bool Status::operator!=(const Status& rhs) const { + return !(*this == rhs); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h new file mode 100644 index 000000000..fb5d67114 --- /dev/null +++ b/src/rocksdb/include/rocksdb/table.h @@ -0,0 +1,607 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. +// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> + +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// -- Block-based Table +class FlushBlockPolicyFactory; +class PersistentCache; +class RandomAccessFile; +struct TableReaderOptions; +struct TableBuilderOptions; +class TableBuilder; +class TableReader; +class WritableFileWriter; +struct EnvOptions; +struct Options; + +enum ChecksumType : char { + kNoChecksum = 0x0, + kCRC32c = 0x1, + kxxHash = 0x2, + kxxHash64 = 0x3, +}; + +// For advanced user only +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. + // + // TODO(ajkr) we need to update names of variables controlling meta-block + // caching as they should now apply to range tombstone and compression + // dictionary meta-blocks, in addition to index and filter meta-blocks. + // + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; + + // If cache_index_and_filter_blocks is enabled, cache index and filter + // blocks with high priority. If set to true, depending on implementation of + // block cache, index and filter blocks may be less likely to be evicted + // than data blocks. + bool cache_index_and_filter_blocks_with_high_priority = true; + + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + + // If cache_index_and_filter_blocks is true and the below is true, then + // the top-level index of partitioned filter and index blocks are stored in + // the cache, but a reference is held in the "table reader" object so the + // blocks are pinned and only evicted from cache when the table reader is + // freed. This is not limited to l0 in LSM tree. + bool pin_top_level_index_and_filter = true; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch = 0x00, + + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + kHashSearch = 0x01, + + // A two-level index implementation. Both levels are binary search indexes. + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. + // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, + }; + + IndexType index_type = kBinarySearch; + + // The index type that will be used for the data block. + enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index + }; + + DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + + // #entries/#buckets. It is valid only when data_block_hash_index_type is + // kDataBlockBinaryAndHash. + double data_block_hash_table_util_ratio = 0.75; + + // This option is now deprecated. No matter what value it is set to, + // it will behave as if hash_index_allow_collision=true. + bool hash_index_allow_collision = true; + + // Use the specified checksum type. Newly created table files will be + // protected with this checksum type. Old table files will still be readable, + // even though they have different checksum type. + ChecksumType checksum = kCRC32c; + + // Disable block cache. If this is set to true, + // then no block cache should be used, and the block_cache should + // point to a nullptr object. + bool no_block_cache = false; + + // If non-NULL use the specified cache for blocks. + // If NULL, rocksdb will automatically create and use an 8MB internal cache. + std::shared_ptr<Cache> block_cache = nullptr; + + // If non-NULL use the specified cache for pages read from device + // IF NULL, no page cache is used + std::shared_ptr<PersistentCache> persistent_cache = nullptr; + + // If non-NULL use the specified cache for compressed blocks. + // If NULL, rocksdb will not use a compressed block cache. + // Note: though it looks similar to `block_cache`, RocksDB doesn't put the + // same type of object there. + std::shared_ptr<Cache> block_cache_compressed = nullptr; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + size_t block_size = 4 * 1024; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + int block_size_deviation = 10; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. The minimum value allowed is 1. Any smaller + // value will be silently overwritten with 1. + int block_restart_interval = 16; + + // Same as block_restart_interval but used for the index block. + int index_block_restart_interval = 1; + + // Block size for partitioned metadata. Currently applied to indexes when + // kTwoLevelIndexSearch is used and to filters when partition_filters is used. + // Note: Since in the current implementation the filters and index partitions + // are aligned, an index/filter block is created when either index or filter + // block size reaches the specified limit. + // Note: this limit is currently applied to only index blocks; a filter + // partition is cut right after an index block is cut + // TODO(myabandeh): remove the note above when filter partitions are cut + // separately + uint64_t metadata_block_size = 4096; + + // Note: currently this option requires kTwoLevelIndexSearch to be set as + // well. + // TODO(myabandeh): remove the note above once the limitation is lifted + // Use partitioned full filters for each SST file. This option is + // incompatible with block-based filters. + bool partition_filters = false; + + // Use delta encoding to compress keys in blocks. + // ReadOptions::pin_data requires this option to be disabled. + // + // Default: true + bool use_delta_encoding = true; + + // If non-nullptr, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + std::shared_ptr<const FilterPolicy> filter_policy = nullptr; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + bool whole_key_filtering = true; + + // Verify that decompressing the compressed block gives back the input. This + // is a verification mode that we use to detect bugs in compression + // algorithms. + bool verify_compression = false; + + // If used, For every data block we load into memory, we will create a bitmap + // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap + // will be used to figure out the percentage we actually read of the blocks. + // + // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and + // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the + // read amplification using this formula + // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + // + // value => memory usage (percentage of loaded blocks memory) + // 1 => 12.50 % + // 2 => 06.25 % + // 4 => 03.12 % + // 8 => 01.56 % + // 16 => 00.78 % + // + // Note: This number must be a power of 2, if not it will be sanitized + // to be the next lowest power of 2, for example a value of 7 will be + // treated as 4, a value of 19 will be treated as 16. + // + // Default: 0 (disabled) + uint32_t read_amp_bytes_per_bit = 0; + + // We currently have five versions: + // 0 -- This version is currently written out by all RocksDB's versions by + // default. Can be read by really old RocksDB's. Doesn't support changing + // checksum (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we + // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you + // don't plan to run RocksDB before version 3.10, you should probably use + // this. + // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we + // encode the keys in index blocks. If you don't plan to run RocksDB before + // version 5.15, you should probably use this. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned + // filters use a generally faster and more accurate Bloom filter + // implementation, with a different schema. + uint32_t format_version = 2; + + // Store index blocks on disk in compressed format. Changing this option to + // false will avoid the overhead of decompression if index blocks are evicted + // and read back + bool enable_index_compression = true; + + // Align data blocks on lesser of page size and block size + bool block_align = false; + + // This enum allows trading off increased index size for improved iterator + // seek performance in some situations, particularly when block cache is + // disabled (ReadOptions::fill_cache = false) and direct IO is + // enabled (DBOptions::use_direct_reads = true). + // The default mode is the best tradeoff for most use cases. + // This option only affects newly written tables. + // + // The index contains a key separating each pair of consecutive blocks. + // Let A be the highest key in one block, B the lowest key in the next block, + // and I the index entry separating these two blocks: + // [ ... A] I [B ...] + // I is allowed to be anywhere in [A, B). + // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + // first block, then immediately fall through to the second block. + // However, if I=A, this can't happen, and we'll read only the second block. + // In kNoShortening mode, we use I=A. In other modes, we use the shortest + // key in [A, B), which usually significantly reduces index size. + // + // There's a similar story for the last index entry, which is an upper bound + // of the highest key in the file. If it's shortened and therefore + // overestimated, iterator is likely to unnecessarily read the last data block + // from each file on each seek. + enum class IndexShorteningMode : char { + // Use full keys. + kNoShortening, + // Shorten index keys between blocks, but use full key for the last index + // key, which is the upper bound of the whole file. + kShortenSeparators, + // Shorten both keys between blocks and key after last block. + kShortenSeparatorsAndSuccessor, + }; + + IndexShorteningMode index_shortening = + IndexShorteningMode::kShortenSeparators; +}; + +// Table Properties that are specific to block-based table properties. +struct BlockBasedTablePropertyNames { + // value of this properties is a fixed int32 number. + static const std::string kIndexType; + // value is "1" for true and "0" for false. + static const std::string kWholeKeyFiltering; + // value is "1" for true and "0" for false. + static const std::string kPrefixFiltering; +}; + +// Create default block based table factory. +extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +#ifndef ROCKSDB_LITE + +enum EncodingType : char { + // Always write full keys without any special encoding. + kPlain, + // Find opportunity to write the same prefix once for multiple rows. + // In some cases, when a key follows a previous key with the same prefix, + // instead of writing out the full key, it just writes out the size of the + // shared prefix, as well as other bytes, to save some bytes. + // + // When using this option, the user is required to use the same prefix + // extractor to make sure the same prefix will be extracted from the same key. + // The Name() value of the prefix extractor will be stored in the file. When + // reopening the file, the name of the options.prefix_extractor given will be + // bitwise compared to the prefix extractors stored in the file. An error + // will be returned if the two don't match. + kPrefix, +}; + +// Table Properties that are specific to plain table properties. +struct PlainTablePropertyNames { + static const std::string kEncodingType; + static const std::string kBloomVersion; + static const std::string kNumBloomBlocks; +}; + +const uint32_t kPlainTableVariableLength = 0; + +struct PlainTableOptions { + // @user_key_len: plain table has optimization for fix-sized keys, which can + // be specified via user_key_len. Alternatively, you can pass + // `kPlainTableVariableLength` if your keys have variable + // lengths. + uint32_t user_key_len = kPlainTableVariableLength; + + // @bloom_bits_per_key: the number of bits used for bloom filer per prefix. + // You may disable it by passing a zero. + int bloom_bits_per_key = 10; + + // @hash_table_ratio: the desired utilization of the hash table used for + // prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the + // hash table + double hash_table_ratio = 0.75; + + // @index_sparseness: inside each prefix, need to build one index record for + // how many keys for binary search inside each hash bucket. + // For encoding type kPrefix, the value will be used when + // writing to determine an interval to rewrite the full + // key. It will also be used as a suggestion and satisfied + // when possible. + size_t index_sparseness = 16; + + // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. + // Otherwise from huge page TLB. The user needs to + // reserve huge pages for it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + size_t huge_page_tlb_size = 0; + + // @encoding_type: how to encode the keys. See enum EncodingType above for + // the choices. The value will determine how to encode keys + // when writing to a new SST file. This value will be stored + // inside the SST file which will be used when reading from + // the file, which makes it possible for users to choose + // different encoding type when reopening a DB. Files with + // different encoding types can co-exist in the same DB and + // can be read. + EncodingType encoding_type = kPlain; + + // @full_scan_mode: mode for reading the whole file one record by one without + // using the index. + bool full_scan_mode = false; + + // @store_index_in_file: compute plain table index and bloom filter during + // file building and store it in file. When reading + // file, index will be mmaped instead of recomputation. + bool store_index_in_file = false; +}; + +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extractor properly to make +// it work. Look-up will starts with prefix hash lookup for key prefix. Inside +// the hash bucket found, a binary search is executed for hash conflicts. +// Finally, a linear search is used. + +extern TableFactory* NewPlainTableFactory( + const PlainTableOptions& options = PlainTableOptions()); + +struct CuckooTablePropertyNames { + // The key that is used to fill empty buckets. + static const std::string kEmptyKey; + // Fixed length of value. + static const std::string kValueLength; + // Number of hash functions used in Cuckoo Hash. + static const std::string kNumHashFunc; + // It denotes the number of buckets in a Cuckoo Block. Given a key and a + // particular hash function, a Cuckoo Block is a set of consecutive buckets, + // where starting bucket id is given by the hash function on the key. In case + // of a collision during inserting the key, the builder tries to insert the + // key in other locations of the cuckoo block before using the next hash + // function. This reduces cache miss during read operation in case of + // collision. + static const std::string kCuckooBlockSize; + // Size of the hash table. Use this number to compute the modulo of hash + // function. The actual number of buckets will be kMaxHashTableSize + + // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to + // accommodate the Cuckoo Block from end of hash table, due to cache friendly + // implementation. + static const std::string kHashTableSize; + // Denotes if the key sorted in the file is Internal Key (if false) + // or User Key only (if true). + static const std::string kIsLastLevel; + // Indicate if using identity function for the first hash function. + static const std::string kIdentityAsFirstHash; + // Indicate if using module or bit and to calculate hash value + static const std::string kUseModuleHash; + // Fixed user key length + static const std::string kUserKeyLength; +}; + +struct CuckooTableOptions { + // Determines the utilization of hash tables. Smaller values + // result in larger hash tables with fewer collisions. + double hash_table_ratio = 0.9; + // A property used by builder to determine the depth to go to + // to search for a path to displace elements in case of + // collision. See Builder.MakeSpaceForKey method. Higher + // values result in more efficient hash tables with fewer + // lookups but take more time to build. + uint32_t max_search_depth = 100; + // In case of collision while inserting, the builder + // attempts to insert in the next cuckoo_block_size + // locations before skipping over to the next Cuckoo hash + // function. This makes lookups more cache friendly in case + // of collisions. + uint32_t cuckoo_block_size = 5; + // If this option is enabled, user key is treated as uint64_t and its value + // is used as hash value directly. This option changes builder's behavior. + // Reader ignore this option and behave according to what specified in table + // property. + bool identity_as_first_hash = false; + // If this option is set to true, module is used during hash calculation. + // This often yields better space efficiency at the cost of performance. + // If this option is set to false, # of entries in table is constrained to be + // power of two, and bit and is used to calculate hash, which is faster in + // general. + bool use_module_hash = true; +}; + +// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing +extern TableFactory* NewCuckooTableFactory( + const CuckooTableOptions& table_options = CuckooTableOptions()); + +#endif // ROCKSDB_LITE + +class RandomAccessFileReader; + +// A base class for table factories. +class TableFactory { + public: + virtual ~TableFactory() {} + + // The type of the table. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // NewTableReader() is called in three places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (2) SstFileDumper (for SST Dump) opens the table and dump the table + // contents using the iterator of the table. + // (3) DBImpl::IngestExternalFile() calls this function to read the contents + // of the sst file it's attempting to add + // + // table_reader_options is a TableReaderOptions which contain all the + // needed parameters and configuration to open the table. + // file is a file handler to handle the file for the table. + // file_size is the physical file size of the file. + // table_reader is the output table reader. + virtual Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // Multiple configured can be accessed from there, including and not limited + // to compression options. file is a handle of a writable file. + // It is the caller's responsibility to keep the file open and close the file + // after closing the table builder. compression_type is the compression type + // to use in this table. + virtual TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const = 0; + + // Sanitizes the specified DB Options and ColumnFamilyOptions. + // + // If the function cannot find a way to sanitize the input DB Options, + // a non-ok Status will be returned. + virtual Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const = 0; + + // Return a string that contains printable format of table configurations. + // RocksDB prints configurations at DB Open(). + virtual std::string GetPrintableTableOptions() const = 0; + + virtual Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const { + return Status::NotSupported( + "The table factory doesn't implement GetOptionString()."); + } + + // Returns the raw pointer of the table options that is used by this + // TableFactory, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the TableFactory owns the + // pointer and the caller should not delete the pointer. + // + // In certain case, it is desirable to alter the underlying options when the + // TableFactory is not used by any open DB by casting the returned pointer + // to the right class. For instance, if BlockBasedTableFactory is used, + // then the pointer can be casted to BlockBasedTableOptions. + // + // Note that changing the underlying TableFactory options while the + // TableFactory is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + virtual void* GetOptions() { return nullptr; } + + // Return is delete range supported + virtual bool IsDeleteRangeSupported() const { return false; } +}; + +#ifndef ROCKSDB_LITE +// Create a special table factory that can open either of the supported +// table formats, based on setting inside the SST files. It should be used to +// convert a DB from one table format to another. +// @table_factory_to_write: the table factory used when writing to new files. +// @block_based_table_factory: block based table factory to use. If NULL, use +// a default one. +// @plain_table_factory: plain table factory to use. If NULL, use a default one. +// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default +// one. +extern TableFactory* NewAdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write = nullptr, + std::shared_ptr<TableFactory> block_based_table_factory = nullptr, + std::shared_ptr<TableFactory> plain_table_factory = nullptr, + std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr); + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h new file mode 100644 index 000000000..d0ac02310 --- /dev/null +++ b/src/rocksdb/include/rocksdb/table_properties.h @@ -0,0 +1,250 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include <stdint.h> +#include <map> +#include <string> +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// -- Table Properties +// Other than basic table properties, each table may also have the user +// collected properties. +// The value of the user-collected properties are encoded as raw bytes -- +// users have to interpret these values by themselves. +// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do +// something similar to: +// +// UserCollectedProperties props = ...; +// for (auto pos = props.lower_bound(prefix); +// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0; +// ++pos) { +// ... +// } +typedef std::map<std::string, std::string> UserCollectedProperties; + +// table properties' human-readable names in the property block. +struct TablePropertiesNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kIndexPartitions; + static const std::string kTopLevelIndexSize; + static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kDeletedKeys; + static const std::string kMergeOperands; + static const std::string kNumRangeDeletions; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; + static const std::string kFilterPolicy; + static const std::string kColumnFamilyName; + static const std::string kColumnFamilyId; + static const std::string kComparator; + static const std::string kMergeOperator; + static const std::string kPrefixExtractorName; + static const std::string kPropertyCollectors; + static const std::string kCompression; + static const std::string kCompressionOptions; + static const std::string kCreationTime; + static const std::string kOldestKeyTime; + static const std::string kFileCreationTime; +}; + +extern const std::string kPropertiesBlock; +extern const std::string kCompressionDictBlock; +extern const std::string kRangeDelBlock; + +// `TablePropertiesCollector` provides the mechanism for users to collect +// their own properties that they are interested in. This class is essentially +// a collection of callback functions that will be invoked during table +// building. It is constructed with TablePropertiesCollectorFactory. The methods +// don't need to be thread-safe, as we will create exactly one +// TablePropertiesCollector object per table and then call it sequentially +class TablePropertiesCollector { + public: + virtual ~TablePropertiesCollector() {} + + // DEPRECATE User defined collector should implement AddUserKey(), though + // this old function still works for backward compatible reason. + // Add() will be called when a new key/value pair is inserted into the table. + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) { + return Status::InvalidArgument( + "TablePropertiesCollector::Add() deprecated."); + } + + // AddUserKey() will be called when a new key/value pair is inserted into the + // table. + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status AddUserKey(const Slice& key, const Slice& value, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) { + // For backwards-compatibility. + return Add(key, value); + } + + // Called after each new block is cut + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) { + // Nothing to do here. Callback registers can override. + return; + } + + // Finish() will be called when a table has already been built and is ready + // for writing the properties block. + // @params properties User will add their collected statistics to + // `properties`. + virtual Status Finish(UserCollectedProperties* properties) = 0; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual UserCollectedProperties GetReadableProperties() const = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; + + // EXPERIMENTAL Return whether the output file should be further compacted + virtual bool NeedCompact() const { return false; } +}; + +// Constructs TablePropertiesCollector. Internals create a new +// TablePropertiesCollector for each new table +class TablePropertiesCollectorFactory { + public: + struct Context { + uint32_t column_family_id; + static const uint32_t kUnknownColumnFamily; + }; + + virtual ~TablePropertiesCollectorFactory() {} + // has to be thread-safe + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; +}; + +// TableProperties contains a bunch of read-only properties of its associated +// table. +struct TableProperties { + public: + // the total size of all data blocks. + uint64_t data_size = 0; + // the size of index block. + uint64_t index_size = 0; + // Total number of index partitions if kTwoLevelIndexSearch is used + uint64_t index_partitions = 0; + // Size of the top-level index if kTwoLevelIndexSearch is used + uint64_t top_level_index_size = 0; + // Whether the index key is user key. Otherwise it includes 8 byte of sequence + // number added by internal key format. + uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; + // the size of filter block. + uint64_t filter_size = 0; + // total raw key size + uint64_t raw_key_size = 0; + // total raw value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + // the number of deletions in the table + uint64_t num_deletions = 0; + // the number of merge operands in the table + uint64_t num_merge_operands = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; + // ID of column family for this SST file, corresponding to the CF identified + // by column_family_name. + uint64_t column_family_id = ROCKSDB_NAMESPACE:: + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + // Timestamp of the latest key. 0 means unknown. + // TODO(sagar0): Should be changed to latest_key_time ... but don't know the + // full implications of backward compatibility. Hence retaining for now. + uint64_t creation_time = 0; + // Timestamp of the earliest key. 0 means unknown. + uint64_t oldest_key_time = 0; + // Actual SST file creation time. 0 means unknown. + uint64_t file_creation_time = 0; + + // Name of the column family with which this SST file is associated. + // If column family is unknown, `column_family_name` will be an empty string. + std::string column_family_name; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // The name of the comparator used in this table. + std::string comparator_name; + + // The name of the merge operator used in this table. + // If no merge operator is used, `merge_operator_name` will be "nullptr". + std::string merge_operator_name; + + // The name of the prefix extractor used in this table + // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr". + std::string prefix_extractor_name; + + // The names of the property collectors factories used in this table + // separated by commas + // {collector_name[1]},{collector_name[2]},{collector_name[3]} .. + std::string property_collectors_names; + + // The compression algo used to compress the SST files. + std::string compression_name; + + // Compression options used to compress the SST files. + std::string compression_options; + + // user collected properties + UserCollectedProperties user_collected_properties; + UserCollectedProperties readable_properties; + + // The offset of the value of each property in the file. + std::map<std::string, uint64_t> properties_offsets; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; + + // Aggregate the numerical member variables of the specified + // TableProperties. + void Add(const TableProperties& tp); +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +// +// DEPRECATED: these properties now belong as TableProperties members. Please +// use TableProperties::num_deletions and TableProperties::num_merge_operands, +// respectively. +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); +extern uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h new file mode 100644 index 000000000..6b2f5c885 --- /dev/null +++ b/src/rocksdb/include/rocksdb/thread_status.h @@ -0,0 +1,188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines the structures for exposing run-time status of any +// rocksdb-related thread. Such run-time status can be obtained via +// GetThreadList() API. +// +// Note that all thread-status features are still under-development, and +// thus APIs and class definitions might subject to change at this point. +// Will remove this comment once the APIs have been finalized. + +#pragma once + +#include <stdint.h> +#include <cstddef> +#include <map> +#include <string> +#include <utility> +#include <vector> + +#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \ + defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#define ROCKSDB_USING_THREAD_STATUS +#endif + +namespace ROCKSDB_NAMESPACE { + +// TODO(yhchiang): remove this function once c++14 is available +// as std::max will be able to cover this. +// Current MS compiler does not support constexpr +template <int A, int B> +struct constexpr_max { + static const int result = (A > B) ? A : B; +}; + +// A structure that describes the current status of a thread. +// The status of active threads can be fetched using +// ROCKSDB_NAMESPACE::GetThreadList(). +struct ThreadStatus { + // The type of a thread. + enum ThreadType : int { + HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool + NUM_THREAD_TYPES + }; + + // The type used to refer to a thread operation. + // A thread operation describes high-level action of a thread. + // Examples include compaction and flush. + enum OperationType : int { + OP_UNKNOWN = 0, + OP_COMPACTION, + OP_FLUSH, + NUM_OP_TYPES + }; + + enum OperationStage : int { + STAGE_UNKNOWN = 0, + STAGE_FLUSH_RUN, + STAGE_FLUSH_WRITE_L0, + STAGE_COMPACTION_PREPARE, + STAGE_COMPACTION_RUN, + STAGE_COMPACTION_PROCESS_KV, + STAGE_COMPACTION_INSTALL, + STAGE_COMPACTION_SYNC_FILE, + STAGE_PICK_MEMTABLES_TO_FLUSH, + STAGE_MEMTABLE_ROLLBACK, + STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + NUM_OP_STAGES + }; + + enum CompactionPropertyType : int { + COMPACTION_JOB_ID = 0, + COMPACTION_INPUT_OUTPUT_LEVEL, + COMPACTION_PROP_FLAGS, + COMPACTION_TOTAL_INPUT_BYTES, + COMPACTION_BYTES_READ, + COMPACTION_BYTES_WRITTEN, + NUM_COMPACTION_PROPERTIES + }; + + enum FlushPropertyType : int { + FLUSH_JOB_ID = 0, + FLUSH_BYTES_MEMTABLES, + FLUSH_BYTES_WRITTEN, + NUM_FLUSH_PROPERTIES + }; + + // The maximum number of properties of an operation. + // This number should be set to the biggest NUM_XXX_PROPERTIES. + static const int kNumOperationProperties = + constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result; + + // The type used to refer to a thread state. + // A state describes lower-level action of a thread + // such as reading / writing a file or waiting for a mutex. + enum StateType : int { + STATE_UNKNOWN = 0, + STATE_MUTEX_WAIT = 1, + NUM_STATE_TYPES + }; + + ThreadStatus(const uint64_t _id, const ThreadType _thread_type, + const std::string& _db_name, const std::string& _cf_name, + const OperationType _operation_type, + const uint64_t _op_elapsed_micros, + const OperationStage _operation_stage, + const uint64_t _op_props[], const StateType _state_type) + : thread_id(_id), + thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + operation_type(_operation_type), + op_elapsed_micros(_op_elapsed_micros), + operation_stage(_operation_stage), + state_type(_state_type) { + for (int i = 0; i < kNumOperationProperties; ++i) { + op_properties[i] = _op_props[i]; + } + } + + // An unique ID for the thread. + const uint64_t thread_id; + + // The type of the thread, it could be HIGH_PRIORITY, + // LOW_PRIORITY, and USER + const ThreadType thread_type; + + // The name of the DB instance where the thread is currently + // involved with. It would be set to empty string if the thread + // does not involve in any DB operation. + const std::string db_name; + + // The name of the column family where the thread is currently + // It would be set to empty string if the thread does not involve + // in any column family. + const std::string cf_name; + + // The operation (high-level action) that the current thread is involved. + const OperationType operation_type; + + // The elapsed time of the current thread operation in microseconds. + const uint64_t op_elapsed_micros; + + // An integer showing the current stage where the thread is involved + // in the current operation. + const OperationStage operation_stage; + + // A list of properties that describe some details about the current + // operation. Same field in op_properties[] might have different + // meanings for different operations. + uint64_t op_properties[kNumOperationProperties]; + + // The state (lower-level action) that the current thread is involved. + const StateType state_type; + + // The followings are a set of utility functions for interpreting + // the information of ThreadStatus + + static std::string GetThreadTypeName(ThreadType thread_type); + + // Obtain the name of an operation given its type. + static const std::string& GetOperationName(OperationType op_type); + + static const std::string MicrosToString(uint64_t op_elapsed_time); + + // Obtain a human-readable string describing the specified operation stage. + static const std::string& GetOperationStageName(OperationStage stage); + + // Obtain the name of the "i"th operation property of the + // specified operation. + static const std::string& GetOperationPropertyName(OperationType op_type, + int i); + + // Translate the "i"th property of the specified operation given + // a property value. + static std::map<std::string, uint64_t> InterpretOperationProperties( + OperationType op_type, const uint64_t* op_properties); + + // Obtain the name of a state given its type. + static const std::string& GetStateName(StateType state_type); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h new file mode 100644 index 000000000..b39321fe8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/threadpool.h @@ -0,0 +1,58 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <functional> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +/* + * ThreadPool is a component that will spawn N background threads that will + * be used to execute scheduled work, The number of background threads could + * be modified by calling SetBackgroundThreads(). + * */ +class ThreadPool { + public: + virtual ~ThreadPool() {} + + // Wait for all threads to finish. + // Discard those threads that did not start + // executing + virtual void JoinAllThreads() = 0; + + // Set the number of background threads that will be executing the + // scheduled jobs. + virtual void SetBackgroundThreads(int num) = 0; + virtual int GetBackgroundThreads() = 0; + + // Get the number of jobs scheduled in the ThreadPool queue. + virtual unsigned int GetQueueLen() const = 0; + + // Waits for all jobs to complete those + // that already started running and those that did not + // start yet. This ensures that everything that was thrown + // on the TP runs even though + // we may not have specified enough threads for the amount + // of jobs + virtual void WaitForJobsAndJoinAllThreads() = 0; + + // Submit a fire and forget jobs + // This allows to submit the same job multiple times + virtual void SubmitJob(const std::function<void()>&) = 0; + // This moves the function in for efficiency + virtual void SubmitJob(std::function<void()>&&) = 0; +}; + +// NewThreadPool() is a function that could be used to create a ThreadPool +// with `num_threads` background threads. +extern ThreadPool* NewThreadPool(int num_threads); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h new file mode 100644 index 000000000..d58ed47b2 --- /dev/null +++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + TraceWriter() {} + virtual ~TraceWriter() {} + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay opertions. +class TraceReader { + public: + TraceReader() {} + virtual ~TraceReader() {} + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; +}; + +// Factory methods to read/write traces from/to a file. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceWriter>* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceReader>* trace_reader); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h new file mode 100644 index 000000000..48d0e5c0b --- /dev/null +++ b/src/rocksdb/include/rocksdb/transaction_log.h @@ -0,0 +1,121 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <vector> +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { + +class LogFile; +typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +}; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr<WriteBatch> writeBatchPtr; + + // Add empty __ctor and __dtor for the rule of five + // However, preserve the original semantics and prohibit copying + // as the std::unique_ptr member does not copy. + BatchResult() {} + + ~BatchResult() {} + + BatchResult(const BatchResult&) = delete; + + BatchResult& operator=(const BatchResult&) = delete; + + BatchResult(BatchResult&& bResult) + : sequence(std::move(bResult.sequence)), + writeBatchPtr(std::move(bResult.writeBatchPtr)) {} + + BatchResult& operator=(BatchResult&& bResult) { + sequence = std::move(bResult.sequence); + writeBatchPtr = std::move(bResult.writeBatchPtr); + return *this; + } +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. + virtual Status status() = 0; + + // If valid return's the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; + + // The read options for TransactionLogIterator. + struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums_; + + ReadOptions() : verify_checksums_(true) {} + + explicit ReadOptions(bool verify_checksums) + : verify_checksums_(verify_checksums) {} + }; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h new file mode 100644 index 000000000..4d004b69d --- /dev/null +++ b/src/rocksdb/include/rocksdb/types.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +// Define all public custom types here. + +// Represents a sequence number in a WAL file. +typedef uint64_t SequenceNumber; + +const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed + +// User-oriented representation of internal key types. +enum EntryType { + kEntryPut, + kEntryDelete, + kEntrySingleDelete, + kEntryMerge, + kEntryRangeDeletion, + kEntryBlobIndex, + kEntryOther, +}; + +// <user key, sequence number, and entry type> tuple. +struct FullKey { + Slice user_key; + SequenceNumber sequence; + EntryType type; + + FullKey() : sequence(0) {} // Intentionally left uninitialized (for speed) + FullKey(const Slice& u, const SequenceNumber& seq, EntryType t) + : user_key(u), sequence(seq), type(t) {} + std::string DebugString(bool hex = false) const; + + void clear() { + user_key.clear(); + sequence = 0; + type = EntryType::kEntryPut; + } +}; + +// Parse slice representing internal key to FullKey +// Parsed FullKey is valid for as long as the memory pointed to by +// internal_key is alive. +bool ParseFullKey(const Slice& internal_key, FullKey* result); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h new file mode 100644 index 000000000..e3aeee6ce --- /dev/null +++ b/src/rocksdb/include/rocksdb/universal_compaction.h @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> +#include <climits> +#include <vector> + +namespace ROCKSDB_NAMESPACE { + +// +// Algorithm used to make a compaction request stop picking new files +// into a single compaction run +// +enum CompactionStopStyle { + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file +}; + +class CompactionOptionsUniversal { + public: + // Percentage flexibility while comparing file size. If the candidate file(s) + // size is 1% smaller than the next file's size, then include next file into + // this candidate set. // Default: 1 + unsigned int size_ratio; + + // The minimum number of files in a single compaction run. Default: 2 + unsigned int min_merge_width; + + // The maximum number of files in a single compaction run. Default: UINT_MAX + unsigned int max_merge_width; + + // The size amplification is defined as the amount (in percentage) of + // additional storage needed to store a single byte of data in the database. + // For example, a size amplification of 2% means that a database that + // contains 100 bytes of user-data may occupy upto 102 bytes of + // physical storage. By this definition, a fully compacted database has + // a size amplification of 0%. Rocksdb uses the following heuristic + // to calculate size amplification: it assumes that all files excluding + // the earliest file contribute to the size amplification. + // Default: 200, which means that a 100 byte database could require upto + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + // Default: -1 + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Option to optimize the universal multi level compaction by enabling + // trivial move for non overlapping files. + // Default: false + bool allow_trivial_move; + + // Default set of parameters + CompactionOptionsUniversal() + : size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize), + allow_trivial_move(false) {} +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/backupable_db.h b/src/rocksdb/include/rocksdb/utilities/backupable_db.h new file mode 100644 index 000000000..f281ed133 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/backupable_db.h @@ -0,0 +1,341 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include <cinttypes> +#include <functional> +#include <map> +#include <string> +#include <vector> + +#include "rocksdb/utilities/stackable_db.h" + +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr<RateLimiter> backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr}; + + // Only used if share_table_files is set to true. If true, will consider that + // backups can come from different databases, hence a sst is not uniquely + // identifed by its name, but by the triple (file name, crc32, file length) + // Default: false + // Note: this is an experimental option, and you'll need to set it manually + // *turn it on only if you know what you're doing* + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + void Dump(Logger* logger) const; + + explicit BackupableDBOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(false), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open) { + assert(share_table_files || !share_files_with_checksum); + } +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupableDBOptions::backup_log_files = false for + // persisting in-memory databases. + // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + uint32_t number_files; + std::string app_metadata; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// A backup engine for accessing information about backups and restoring from +// them. +class BackupEngineReadOnly { + public: + virtual ~BackupEngineReadOnly() {} + + static Status Open(Env* db_env, const BackupableDBOptions& options, + BackupEngineReadOnly** backup_engine_ptr); + + // Returns info about backups in backup_info + // You can GetBackupInfo safely, even with other BackupEngine performing + // backups on the same directory + virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0; + + // Returns info about corrupt backups in corrupt_backups + virtual void GetCorruptedBackups( + std::vector<BackupID>* corrupt_backup_ids) = 0; + + // Restoring DB from backup is NOT safe when there is another BackupEngine + // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's + // responsibility to synchronize the operation, i.e. don't delete the backup + // when you're restoring from it + // See also the corresponding doc in BackupEngine + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // See the corresponding doc in BackupEngine + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // checks that each file exists and that the size of the file matches our + // expectations. it does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes against the number of bytes written to them during creation. + // Otherwise, it compares the files' current sizes against their sizes when + // the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual Status VerifyBackup(BackupID backup_id) = 0; +}; + +// A backup engine for creating new backups. +class BackupEngine { + public: + virtual ~BackupEngine() {} + + // BackupableDBOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static Status Open(Env* db_env, const BackupableDBOptions& options, + BackupEngine** backup_engine_ptr); + + // same as CreateNewBackup, but stores extra application metadata + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + virtual Status CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function<void()> progress_callback = []() {}) = 0; + + // Captures the state of the database in the latest backup + // NOT a thread safe call + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, + std::function<void()> progress_callback = + []() {}) { + return CreateNewBackupWithMetadata(db, "", flush_before_backup, + progress_callback); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual Status DeleteBackup(BackupID backup_id) = 0; + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Returns info about backups in backup_info + virtual void GetBackupInfo(std::vector<BackupInfo>* backup_info) = 0; + + // Returns info about corrupt backups in corrupt_backups + virtual void GetCorruptedBackups( + std::vector<BackupID>* corrupt_backup_ids) = 0; + + // restore from backup with backup_id + // IMPORTANT -- if options_.share_table_files == true, + // options_.share_files_with_checksum == false, you restore DB from some + // backup that is not the latest, and you start creating new backups from the + // new DB, they will probably fail. + // + // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. + // If you add new data to the DB and try creating a new backup now, the + // database will diverge from backups 4 and 5 and the new backup will fail. + // If you want to create new backup, you will first have to delete backups 4 + // and 5. + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // restore from the latest backup + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + + // checks that each file exists and that the size of the file matches our + // expectations. it does not check file checksum. + // Returns Status::OK() if all checks are good + virtual Status VerifyBackup(BackupID backup_id) = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + virtual Status GarbageCollect() = 0; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h new file mode 100644 index 000000000..c7f93b4cf --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A checkpoint is an openable snapshot of a database at a point in time. + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; +class ColumnFamilyHandle; +struct LiveFileMetaData; +struct ExportImportFilesMetaData; + +class Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable snapshots + static Status Create(DB* db, Checkpoint** checkpoint_ptr); + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // SST files will be copied if output directory is on a different filesystem + // (2) a copied manifest files and other files + // The directory should not already exist and will be created by this API. + // The directory will be an absolute path + // log_size_for_flush: if the total log file size is equal or larger than + // this value, then a flush is triggered for all the column families. The + // default value is 0, which means flush is always triggered. If you move + // away from the default, the checkpoint may not contain up-to-date data + // if WAL writing is not always enabled. + // Flush will always trigger if it is 2PC. + virtual Status CreateCheckpoint(const std::string& checkpoint_dir, + uint64_t log_size_for_flush = 0); + + // Exports all live SST files of a specified Column Family onto export_dir, + // returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + + virtual ~Checkpoint() {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h new file mode 100644 index 000000000..f61afd69e --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/convenience.h @@ -0,0 +1,10 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +// This file was moved to rocksdb/convenience.h" + +#include "rocksdb/convenience.h" diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h new file mode 100644 index 000000000..dd83cb24b --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Database with TTL support. +// +// USE-CASES: +// This API should be used to open the db when key-values inserted are +// meant to be removed from the db in a non-strict 'ttl' amount of time +// Therefore, this guarantees that key-values inserted will remain in the +// db for >= ttl amount of time and the db will make efforts to remove the +// key-values as soon as possible after ttl seconds of their insertion. +// +// BEHAVIOUR: +// TTL is accepted in seconds +// (int32_t)Timestamp(creation) is suffixed to values in Put internally +// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now) +// Get/Iterator may return expired entries(compaction not run on them yet) +// Different TTL may be used during different Opens +// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2 +// Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5 +// read_only=true opens in the usual read-only mode. Compactions will not be +// triggered(neither manual nor automatic), so no expired entries removed +// +// CONSTRAINTS: +// Not specifying/passing or non-positive TTL behaves like TTL = infinity +// +// !!!WARNING!!!: +// Calling DB::Open directly to re-open a db created by this API will get +// corrupt values(timestamp suffixed) and no ttl effect will be there +// during the second Open, so use this API consistently to open the db +// Be careful when passing ttl with a small positive value because the +// whole database may be deleted in a small amount of time + +class DBWithTTL : public StackableDB { + public: + virtual Status CreateColumnFamilyWithTtl( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle, int ttl) = 0; + + static Status Open(const Options& options, const std::string& dbname, + DBWithTTL** dbptr, int32_t ttl = 0, + bool read_only = false); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, + DBWithTTL** dbptr, std::vector<int32_t> ttls, + bool read_only = false); + + virtual void SetTtl(int32_t ttl) = 0; + + virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0; + + protected: + explicit DBWithTTL(DB* db) : StackableDB(db) {} +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h new file mode 100644 index 000000000..a2b6adcb0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/debug.h @@ -0,0 +1,49 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/db.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// Data associated with a particular version of a key. A database may internally +// store multiple versions of a same user key due to snapshots, compaction not +// happening yet, etc. +struct KeyVersion { + KeyVersion() : user_key(""), value(""), sequence(0), type(0) {} + + KeyVersion(const std::string& _user_key, const std::string& _value, + SequenceNumber _sequence, int _type) + : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {} + + std::string user_key; + std::string value; + SequenceNumber sequence; + // TODO(ajkr): we should provide a helper function that converts the int to a + // string describing the type for easier debugging. + int type; +}; + +// Returns listing of all versions of keys in the provided user key range. +// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or +// `max_num_ikeys` has been reached. Since all those keys returned will be +// copied to memory, if the range covers too many keys, the memory usage +// may be huge. `max_num_ikeys` can be used to cap the memory usage. +// The result is inserted into the provided vector, `key_versions`. +Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, + std::vector<KeyVersion>* key_versions); + +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector<KeyVersion>* key_versions); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/env_librados.h b/src/rocksdb/include/rocksdb/utilities/env_librados.h new file mode 100644 index 000000000..361217c62 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/env_librados.h @@ -0,0 +1,175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> +#include <string> + +#include "rocksdb/status.h" +#include "rocksdb/utilities/env_mirror.h" + +#include <rados/librados.hpp> + +namespace ROCKSDB_NAMESPACE { +class LibradosWritableFile; + +class EnvLibrados : public EnvWrapper { + public: + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + Status NewSequentialFile(const std::string& fname, + std::unique_ptr<SequentialFile>* result, + const EnvOptions& options) override; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr<RandomAccessFile>* result, + const EnvOptions& options) override; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + Status NewWritableFile(const std::string& fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) override; + + // Reuse an existing file by renaming it and opening it as writable. + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* result, + const EnvOptions& options) override; + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) override; + + // Returns OK if the named file exists. + // NotFound if the named file does not exist, + // the calling process does not have permission to determine + // whether this file exists, or if the path is invalid. + // IOError if an IO Error was encountered + Status FileExists(const std::string& fname) override; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + Status GetChildren(const std::string& dir, std::vector<std::string>* result); + + // Delete the named file. + Status DeleteFile(const std::string& fname) override; + + // Create the specified directory. Returns error if directory exists. + Status CreateDir(const std::string& dirname) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + Status CreateDirIfMissing(const std::string& dirname) override; + + // Delete the specified directory. + Status DeleteDir(const std::string& dirname) override; + + // Store the size of fname in *file_size. + Status GetFileSize(const std::string& fname, uint64_t* file_size) override; + + // Store the last modification time of fname in *file_mtime. + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override; + // Rename file src to target. + Status RenameFile(const std::string& src, const std::string& target) override; + // Hard Link file src to target. + Status LinkFile(const std::string& src, const std::string& target) override; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + Status LockFile(const std::string& fname, FileLock** lock); + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + Status UnlockFile(FileLock* lock); + + // Get full directory name for this db. + Status GetAbsolutePath(const std::string& db_path, std::string* output_path); + + // Generate unique id + std::string GenerateUniqueId(); + + // Get default EnvLibrados + static EnvLibrados* Default(); + + explicit EnvLibrados(const std::string& db_name, + const std::string& config_path, + const std::string& db_pool); + + explicit EnvLibrados( + const std::string& client_name, // first 3 parameters are + // for RADOS client init + const std::string& cluster_name, const uint64_t flags, + const std::string& db_name, const std::string& config_path, + const std::string& db_pool, const std::string& wal_dir, + const std::string& wal_pool, const uint64_t write_buffer_size); + ~EnvLibrados() { _rados.shutdown(); } + + private: + std::string _client_name; + std::string _cluster_name; + uint64_t _flags; + std::string _db_name; // get from user, readable string; Also used as db_id + // for db metadata + std::string _config_path; + librados::Rados _rados; // RADOS client + std::string _db_pool_name; + librados::IoCtx _db_pool_ioctx; // IoCtx for connecting db_pool + std::string _wal_dir; // WAL dir path + std::string _wal_pool_name; + librados::IoCtx _wal_pool_ioctx; // IoCtx for connecting wal_pool + uint64_t _write_buffer_size; // WritableFile buffer max size + + /* private function to communicate with rados */ + std::string _CreateFid(); + Status _GetFid(const std::string& fname, std::string& fid); + Status _GetFid(const std::string& fname, std::string& fid, int fid_len); + Status _RenameFid(const std::string& old_fname, const std::string& new_fname); + Status _AddFid(const std::string& fname, const std::string& fid); + Status _DelFid(const std::string& fname); + Status _GetSubFnames(const std::string& dirname, + std::vector<std::string>* result); + librados::IoCtx* _GetIoctx(const std::string& prefix); + friend class LibradosWritableFile; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h new file mode 100644 index 000000000..8e96ac410 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h @@ -0,0 +1,180 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2015, Red Hat, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// MirrorEnv is an Env implementation that mirrors all file-related +// operations to two backing Env's (provided at construction time). +// Writes are mirrored. For read operations, we do the read from both +// backends and assert that the results match. +// +// This is useful when implementing a new Env and ensuring that the +// semantics and behavior are correct (in that they match that of an +// existing, stable Env, like the default POSIX one). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <iostream> +#include <vector> +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +class SequentialFileMirror; +class RandomAccessFileMirror; +class WritableFileMirror; + +class EnvMirror : public EnvWrapper { + Env *a_, *b_; + bool free_a_, free_b_; + + public: + EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false) + : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {} + ~EnvMirror() { + if (free_a_) delete a_; + if (free_b_) delete b_; + } + + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr<RandomAccessFile>* r, + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr<WritableFile>* r, + const EnvOptions& options) override; + virtual Status NewDirectory(const std::string& name, + std::unique_ptr<Directory>* result) override { + std::unique_ptr<Directory> br; + Status as = a_->NewDirectory(name, result); + Status bs = b_->NewDirectory(name, &br); + assert(as == bs); + return as; + } + Status FileExists(const std::string& f) override { + Status as = a_->FileExists(f); + Status bs = b_->FileExists(f); + assert(as == bs); + return as; + } +#if defined(_MSC_VER) +#pragma warning(push) +// logical operation on address of string constant +#pragma warning(disable : 4130) +#endif + Status GetChildren(const std::string& dir, + std::vector<std::string>* r) override { + std::vector<std::string> ar, br; + Status as = a_->GetChildren(dir, &ar); + Status bs = b_->GetChildren(dir, &br); + assert(as == bs); + std::sort(ar.begin(), ar.end()); + std::sort(br.begin(), br.end()); + if (!as.ok() || ar != br) { + assert(0 == "getchildren results don't match"); + } + *r = ar; + return as; + } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + Status DeleteFile(const std::string& f) override { + Status as = a_->DeleteFile(f); + Status bs = b_->DeleteFile(f); + assert(as == bs); + return as; + } + Status CreateDir(const std::string& d) override { + Status as = a_->CreateDir(d); + Status bs = b_->CreateDir(d); + assert(as == bs); + return as; + } + Status CreateDirIfMissing(const std::string& d) override { + Status as = a_->CreateDirIfMissing(d); + Status bs = b_->CreateDirIfMissing(d); + assert(as == bs); + return as; + } + Status DeleteDir(const std::string& d) override { + Status as = a_->DeleteDir(d); + Status bs = b_->DeleteDir(d); + assert(as == bs); + return as; + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + uint64_t asize, bsize; + Status as = a_->GetFileSize(f, &asize); + Status bs = b_->GetFileSize(f, &bsize); + assert(as == bs); + assert(!as.ok() || asize == bsize); + *s = asize; + return as; + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { + uint64_t amtime, bmtime; + Status as = a_->GetFileModificationTime(fname, &amtime); + Status bs = b_->GetFileModificationTime(fname, &bmtime); + assert(as == bs); + assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000); + *file_mtime = amtime; + return as; + } + + Status RenameFile(const std::string& s, const std::string& t) override { + Status as = a_->RenameFile(s, t); + Status bs = b_->RenameFile(s, t); + assert(as == bs); + return as; + } + + Status LinkFile(const std::string& s, const std::string& t) override { + Status as = a_->LinkFile(s, t); + Status bs = b_->LinkFile(s, t); + assert(as == bs); + return as; + } + + class FileLockMirror : public FileLock { + public: + FileLock *a_, *b_; + FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {} + }; + + Status LockFile(const std::string& f, FileLock** l) override { + FileLock *al, *bl; + Status as = a_->LockFile(f, &al); + Status bs = b_->LockFile(f, &bl); + assert(as == bs); + if (as.ok()) *l = new FileLockMirror(al, bl); + return as; + } + + Status UnlockFile(FileLock* l) override { + FileLockMirror* ml = static_cast<FileLockMirror*>(l); + Status as = a_->UnlockFile(ml->a_); + Status bs = b_->UnlockFile(ml->b_); + assert(as == bs); + delete ml; + return as; + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h new file mode 100644 index 000000000..824f8a3df --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// This function can be used to list the Information logs, +// given the db pointer. +Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h new file mode 100644 index 000000000..94548b538 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h @@ -0,0 +1,277 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <functional> +#include <map> +#include <sstream> +#include <string> +#include <vector> + +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/ldb_tool.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/ldb_cmd_execute_result.h" + +namespace ROCKSDB_NAMESPACE { + +class LDBCommand { + public: + // Command-line arguments + static const std::string ARG_ENV_URI; + static const std::string ARG_DB; + static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; + static const std::string ARG_HEX; + static const std::string ARG_KEY_HEX; + static const std::string ARG_VALUE_HEX; + static const std::string ARG_CF_NAME; + static const std::string ARG_TTL; + static const std::string ARG_TTL_START; + static const std::string ARG_TTL_END; + static const std::string ARG_TIMESTAMP; + static const std::string ARG_TRY_LOAD_OPTIONS; + static const std::string ARG_IGNORE_UNKNOWN_OPTIONS; + static const std::string ARG_FROM; + static const std::string ARG_TO; + static const std::string ARG_MAX_KEYS; + static const std::string ARG_BLOOM_BITS; + static const std::string ARG_FIX_PREFIX_LEN; + static const std::string ARG_COMPRESSION_TYPE; + static const std::string ARG_COMPRESSION_MAX_DICT_BYTES; + static const std::string ARG_BLOCK_SIZE; + static const std::string ARG_AUTO_COMPACTION; + static const std::string ARG_DB_WRITE_BUFFER_SIZE; + static const std::string ARG_WRITE_BUFFER_SIZE; + static const std::string ARG_FILE_SIZE; + static const std::string ARG_CREATE_IF_MISSING; + static const std::string ARG_NO_VALUE; + + struct ParsedParams { + std::string cmd; + std::vector<std::string> cmd_params; + std::map<std::string, std::string> option_map; + std::vector<std::string> flags; + }; + + static LDBCommand* SelectCommand(const ParsedParams& parsed_parms); + + static LDBCommand* InitFromCmdLineArgs( + const std::vector<std::string>& args, const Options& options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families, + const std::function<LDBCommand*(const ParsedParams&)>& selector = + SelectCommand); + + static LDBCommand* InitFromCmdLineArgs( + int argc, char** argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families); + + bool ValidateCmdLineOptions(); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void SetDBOptions(Options options) { options_ = options; } + + virtual void SetColumnFamilies( + const std::vector<ColumnFamilyDescriptor>* column_families) { + if (column_families != nullptr) { + column_families_ = *column_families; + } else { + column_families_.clear(); + } + } + + void SetLDBOptions(const LDBOptions& ldb_options) { + ldb_options_ = ldb_options; + } + + const std::map<std::string, std::string>& TEST_GetOptionMap() { + return option_map_; + } + + const std::vector<std::string>& TEST_GetFlags() { return flags_; } + + virtual bool NoDBOpen() { return false; } + + virtual ~LDBCommand() { CloseDB(); } + + /* Run the command, and return the execute result. */ + void Run(); + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { return exec_state_; } + + void ClearPreviousRunState() { exec_state_.Reset(); } + + // Consider using Slice::DecodeHex directly instead if you don't need the + // 0x prefix + static std::string HexToString(const std::string& str); + + // Consider using Slice::ToString(true) directly instead if + // you don't need the 0x prefix + static std::string StringToHex(const std::string& str); + + static const char* DELIM; + + protected: + LDBCommandExecuteResult exec_state_; + std::string env_uri_; + std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb wll leave the source directory completely intact. + std::string secondary_path_; + std::string column_family_name_; + DB* db_; + DBWithTTL* db_ttl_; + std::map<std::string, ColumnFamilyHandle*> cf_handles_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + // If true, try to construct options from DB's option files. + bool try_load_options_; + + bool ignore_unknown_options_; + + bool create_if_missing_; + + /** + * Map of options passed on the command-line. + */ + const std::map<std::string, std::string> option_map_; + + /** + * Flags passed on the command-line. + */ + const std::vector<std::string> flags_; + + /** List of command-line options valid for this command */ + const std::vector<std::string> valid_cmd_line_options_; + + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr<Env> env_guard_; + + bool ParseKeyValue(const std::string& line, std::string* key, + std::string* value, bool is_key_hex, bool is_value_hex); + + LDBCommand(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags, bool is_read_only, + const std::vector<std::string>& valid_cmd_line_options); + + void OpenDB(); + + void CloseDB(); + + ColumnFamilyHandle* GetCfHandle(); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex); + + static std::string PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex); + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const std::vector<std::string>& flags, + const std::string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static std::string HelpRangeCmdArgs(); + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + static std::vector<std::string> BuildCmdLineOptions( + std::vector<std::string> options); + + bool ParseIntOption(const std::map<std::string, std::string>& options, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const std::map<std::string, std::string>& options, + const std::string& option, std::string* value); + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map<std::string, std::string>& options, + const std::string& option, bool default_val); + + Options options_; + std::vector<ColumnFamilyDescriptor> column_families_; + LDBOptions ldb_options_; + + private: + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. + */ + bool IsValueHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(std::string val); +}; + +class LDBCommandRunner { + public: + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); + + // Returns the status code to return. 0 is no error. + static int RunCommand( + int argc, char** argv, Options options, const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h new file mode 100644 index 000000000..c837b47f7 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -0,0 +1,71 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#ifdef FAILED +#undef FAILED +#endif + +namespace ROCKSDB_NAMESPACE { + +class LDBCommandExecuteResult { + public: + enum State { + EXEC_NOT_STARTED = 0, + EXEC_SUCCEED = 1, + EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} + + LDBCommandExecuteResult(State state, std::string& msg) + : state_(state), message_(msg) {} + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { return state_ == EXEC_SUCCEED; } + + bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; } + + bool IsFailed() { return state_ == EXEC_FAILED; } + + static LDBCommandExecuteResult Succeed(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult Failed(std::string msg) { + return LDBCommandExecuteResult(EXEC_FAILED, msg); + } + + private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h new file mode 100644 index 000000000..e9fef9609 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stddef.h> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +struct Options; +class Snapshot; + +enum CompressionType : unsigned char; + +// Options to control the behavior of a database (passed to +// DB::Open). A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. +struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + + // Create a LevelDBOptions object with default values for all fields. + LevelDBOptions(); +}; + +// Converts a LevelDBOptions object into a RocksDB Options object. +Options ConvertOptions(const LevelDBOptions& leveldb_options); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h new file mode 100644 index 000000000..f617da02b --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h @@ -0,0 +1,43 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifdef LUA + +// lua headers +extern "C" { +#include <lauxlib.h> +#include <lua.h> +#include <lualib.h> +} + +namespace ROCKSDB_NAMESPACE { +namespace lua { +// A class that used to define custom C Library that is callable +// from Lua script +class RocksLuaCustomLibrary { + public: + virtual ~RocksLuaCustomLibrary() {} + // The name of the C library. This name will also be used as the table + // (namespace) in Lua that contains the C library. + virtual const char* Name() const = 0; + + // Returns a "static const struct luaL_Reg[]", which includes a list of + // C functions. Note that the last entry of this static array must be + // {nullptr, nullptr} as required by Lua. + // + // More details about how to implement Lua C libraries can be found + // in the official Lua document http://www.lua.org/pil/26.2.html + virtual const struct luaL_Reg* Lib() const = 0; + + // A function that will be called right after the library has been created + // and pushed on the top of the lua_State. This custom setup function + // allows developers to put additional table or constant values inside + // the same table / namespace. + virtual void CustomSetup(lua_State* /*L*/) const {} +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h new file mode 100644 index 000000000..3427b65ef --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h @@ -0,0 +1,55 @@ +// Copyright (c) 2016, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +// lua headers +extern "C" { +#include <lauxlib.h> +#include <lua.h> +#include <lualib.h> +} + +#ifdef LUA +#include <string> +#include <vector> + +#include "rocksdb/utilities/lua/rocks_lua_custom_library.h" + +namespace ROCKSDB_NAMESPACE { +namespace lua { +class LuaStateWrapper { + public: + explicit LuaStateWrapper(const std::string& lua_script) { + lua_state_ = luaL_newstate(); + Init(lua_script, {}); + } + LuaStateWrapper( + const std::string& lua_script, + const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) { + lua_state_ = luaL_newstate(); + Init(lua_script, libraries); + } + lua_State* GetLuaState() const { return lua_state_; } + ~LuaStateWrapper() { lua_close(lua_state_); } + + private: + void Init( + const std::string& lua_script, + const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) { + if (lua_state_) { + luaL_openlibs(lua_state_); + for (const auto& library : libraries) { + luaL_openlib(lua_state_, library->Name(), library->Lib(), 0); + library->CustomSetup(lua_state_); + } + luaL_dostring(lua_state_, lua_script.c_str()); + } + } + + lua_State* lua_state_; +}; +} // namespace lua +} // namespace ROCKSDB_NAMESPACE +#endif // LUA diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h new file mode 100644 index 000000000..4f1606b51 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#pragma once + +#include <map> +#include <string> +#include <unordered_set> +#include <vector> + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the current memory usage of the specified DB instances. +class MemoryUtil { + public: + enum UsageType : int { + // Memory usage of all the mem-tables. + kMemTableTotal = 0, + // Memory usage of those un-flushed mem-tables. + kMemTableUnFlushed = 1, + // Memory usage of all the table readers. + kTableReadersTotal = 2, + // Memory usage by Cache. + kCacheTotal = 3, + kNumUsageTypes = 4 + }; + + // Returns the approximate memory usage of different types in the input + // list of DBs and Cache set. For instance, in the output map + // usage_by_type, usage_by_type[kMemTableTotal] will store the memory + // usage of all the mem-tables from all the input rocksdb instances. + // + // Note that for memory usage inside Cache class, we will + // only report the usage of the input "cache_set" without + // including those Cache usage inside the input list "dbs" + // of DBs. + static Status GetApproximateMemoryUsageByType( + const std::vector<DB*>& dbs, + const std::unordered_set<const Cache*> cache_set, + std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type); +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h new file mode 100644 index 000000000..74a49d400 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h @@ -0,0 +1,205 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <functional> +#include <memory> +#include <regex> +#include <string> +#include <unordered_map> +#include <vector> +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +// Returns a new T when called with a string. Populates the std::unique_ptr +// argument if granting ownership to caller. +template <typename T> +using FactoryFunc = + std::function<T*(const std::string&, std::unique_ptr<T>*, std::string*)>; + +class ObjectLibrary { + public: + // Base class for an Entry in the Registry. + class Entry { + public: + virtual ~Entry() {} + Entry(const std::string& name) : name_(std::move(name)) {} + + // Checks to see if the target matches this entry + virtual bool matches(const std::string& target) const { + return name_ == target; + } + const std::string& Name() const { return name_; } + + private: + const std::string name_; // The name of the Entry + }; // End class Entry + + // An Entry containing a FactoryFunc for creating new Objects + template <typename T> + class FactoryEntry : public Entry { + public: + FactoryEntry(const std::string& name, FactoryFunc<T> f) + : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} + ~FactoryEntry() override {} + bool matches(const std::string& target) const override { + return std::regex_match(target, pattern_); + } + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } + + private: + std::regex pattern_; // The pattern for this entry + FactoryFunc<T> factory_; + }; // End class FactoryEntry + public: + // Finds the entry matching the input name and type + const Entry* FindEntry(const std::string& type, + const std::string& name) const; + void Dump(Logger* logger) const; + + // Registers the factory with the library for the pattern. + // If the pattern matches, the factory may be used to create a new object. + template <typename T> + const FactoryFunc<T>& Register(const std::string& pattern, + const FactoryFunc<T>& factory) { + std::unique_ptr<Entry> entry(new FactoryEntry<T>(pattern, factory)); + AddEntry(T::Type(), entry); + return factory; + } + // Returns the default ObjectLibrary + static std::shared_ptr<ObjectLibrary>& Default(); + + private: + // Adds the input entry to the list for the given type + void AddEntry(const std::string& type, std::unique_ptr<Entry>& entry); + + // ** FactoryFunctions for this loader, organized by type + std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>> entries_; +}; + +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr<ObjectRegistry> NewInstance(); + + ObjectRegistry(); + + void AddLibrary(const std::shared_ptr<ObjectLibrary>& library) { + libraries_.emplace_back(library); + } + + // Creates a new T using the factory function that was registered with a + // pattern that matches the provided "target" string according to + // std::regex_match. + // + // If no registered functions match, returns nullptr. If multiple functions + // match, the factory function used is unspecified. + // + // Populates res_guard with result pointer if caller is granted ownership. + template <typename T> + T* NewObject(const std::string& target, std::unique_ptr<T>* guard, + std::string* errmsg) { + guard->reset(); + const auto* basic = FindEntry(T::Type(), target); + if (basic != nullptr) { + const auto* factory = + static_cast<const ObjectLibrary::FactoryEntry<T>*>(basic); + return factory->NewFactoryObject(target, guard, errmsg); + } else { + *errmsg = std::string("Could not load ") + T::Type(); + return nullptr; + } + } + + // Creates a new unique T using the input factory functions. + // Returns OK if a new unique T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template <typename T> + Status NewUniqueObject(const std::string& target, + std::unique_ptr<T>* result) { + std::string errmsg; + T* ptr = NewObject(target, result, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (*result) { + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new shared T using the input factory functions. + // Returns OK if a new shared T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template <typename T> + Status NewSharedObject(const std::string& target, + std::shared_ptr<T>* result) { + std::string errmsg; + std::unique_ptr<T> guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } + + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template <typename T> + Status NewStaticObject(const std::string& target, T** result) { + std::string errmsg; + std::unique_ptr<T> guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); + } + } + + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + private: + const ObjectLibrary::Entry* FindEntry(const std::string& type, + const std::string& name) const; + + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. + std::vector<std::shared_ptr<ObjectLibrary>> libraries_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h new file mode 100644 index 000000000..5356df71f --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +class Transaction; + +// Database with Transaction support. +// +// See optimistic_transaction.h and examples/transaction_example.cc + +// Options to use when starting an Optimistic Transaction +struct OptimisticTransactionOptions { + // Setting set_snapshot=true is the same as calling SetSnapshot(). + bool set_snapshot = false; + + // Should be set if the DB has a non-default comparator. + // See comment in WriteBatchWithIndex constructor. + const Comparator* cmp = BytewiseComparator(); +}; + +enum class OccValidationPolicy { + // Validate serially at commit stage, AFTER entering the write-group. + // Isolation validation is processed single-threaded(since in the + // write-group). + // May suffer from high mutex contention, as per this link: + // https://github.com/facebook/rocksdb/issues/4402 + kValidateSerial = 0, + // Validate parallelly before commit stage, BEFORE entering the write-group to + // reduce mutex contention. Each txn acquires locks for its write-set + // records in some well-defined order. + kValidateParallel = 1 +}; + +struct OptimisticTransactionDBOptions { + OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel; + + // works only if validate_policy == OccValidationPolicy::kValidateParallel + uint32_t occ_lock_buckets = (1 << 20); +}; + +class OptimisticTransactionDB : public StackableDB { + public: + // Open an OptimisticTransactionDB similar to DB::Open(). + static Status Open(const Options& options, const std::string& dbname, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, + OptimisticTransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const OptimisticTransactionDBOptions& occ_options, + const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, + OptimisticTransactionDB** dbptr); + + virtual ~OptimisticTransactionDB() {} + + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. + // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + OptimisticTransactionDB(const OptimisticTransactionDB&) = delete; + void operator=(const OptimisticTransactionDB&) = delete; + + protected: + // To Create an OptimisticTransactionDB, call Open() + explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {} +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h new file mode 100644 index 000000000..cb1d0d117 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// Try to migrate DB created with old_opts to be use new_opts. +// Multiple column families is not supported. +// It is best-effort. No guarantee to succeed. +// A full compaction may be executed. +Status OptionChangeMigration(std::string dbname, const Options& old_opts, + const Options& new_opts); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h new file mode 100644 index 000000000..1a29464a6 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/options_util.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This file contains utility functions for RocksDB Options. +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// Constructs the DBOptions and ColumnFamilyDescriptors by loading the +// latest RocksDB options file stored in the specified rocksdb database. +// +// Note that the all the pointer options (except table_factory, which will +// be described in more details below) will be initialized with the default +// values. Developers can further initialize them after this function call. +// Below is an example list of pointer options which will be initialized +// +// * env +// * memtable_factory +// * compaction_filter_factory +// * prefix_extractor +// * comparator +// * merge_operator +// * compaction_filter +// +// User can also choose to load customized comparator, env, and/or +// merge_operator through object registry: +// * comparator needs to be registered through Registrar<const Comparator> +// * env needs to be registered through Registrar<Env> +// * merge operator needs to be registered through +// Registrar<std::shared_ptr<MergeOperator>>. +// +// For table_factory, this function further supports deserializing +// BlockBasedTableFactory and its BlockBasedTableOptions except the +// pointer options of BlockBasedTableOptions (flush_block_policy_factory, +// block_cache, and block_cache_compressed), which will be initialized with +// default values. Developers can further specify these three options by +// casting the return value of TableFactoroy::GetOptions() to +// BlockBasedTableOptions and making necessary changes. +// +// ignore_unknown_options can be set to true if you want to ignore options +// that are from a newer version of the db, esentially for forward +// compatibility. +// +// examples/options_file_example.cc demonstrates how to use this function +// to open a RocksDB instance. +// +// @return the function returns an OK status when it went successfully. If +// the specified "dbpath" does not contain any option file, then a +// Status::NotFound will be returned. A return value other than +// Status::OK or Status::NotFound indicates there're some error related +// to the options file itself. +// +// @see LoadOptionsFromFile +Status LoadLatestOptions(const std::string& dbpath, Env* env, + DBOptions* db_options, + std::vector<ColumnFamilyDescriptor>* cf_descs, + bool ignore_unknown_options = false, + std::shared_ptr<Cache>* cache = {}); + +// Similar to LoadLatestOptions, this function constructs the DBOptions +// and ColumnFamilyDescriptors based on the specified RocksDB Options file. +// +// @see LoadLatestOptions +Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, + DBOptions* db_options, + std::vector<ColumnFamilyDescriptor>* cf_descs, + bool ignore_unknown_options = false, + std::shared_ptr<Cache>* cache = {}); + +// Returns the latest options file name under the specified db path. +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name); + +// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors +// are compatible with the latest options stored in the specified DB path. +// +// If the return status is non-ok, it means the specified RocksDB instance +// might not be correctly opened with the input set of options. Currently, +// changing one of the following options will fail the compatibility check: +// +// * comparator +// * prefix_extractor +// * table_factory +// * merge_operator +Status CheckOptionsCompatibility( + const std::string& dbpath, Env* env, const DBOptions& db_options, + const std::vector<ColumnFamilyDescriptor>& cf_descs, + bool ignore_unknown_options = false); + +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h new file mode 100644 index 000000000..ba6f1d748 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stdint.h> +#include <memory> +#include <string> +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class SimCache; + +// For instrumentation purpose, use NewSimCache instead of NewLRUCache API +// NewSimCache is a wrapper function returning a SimCache instance that can +// have additional interface provided in Simcache class besides Cache interface +// to predict block cache hit rate without actually allocating the memory. It +// can help users tune their current block cache size, and determine how +// efficient they are using the memory. +// +// Since GetSimCapacity() returns the capacity for simulutation, it differs from +// actual memory usage, which can be estimated as: +// sim_capacity * entry_size / (entry_size + block_size), +// where 76 <= entry_size <= 104, +// BlockBasedTableOptions.block_size = 4096 by default but is configurable, +// Therefore, generally the actual memory overhead of SimCache is Less than +// sim_capacity * 2% +extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache, + size_t sim_capacity, + int num_shard_bits); + +extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache, + std::shared_ptr<Cache> cache, + int num_shard_bits); + +class SimCache : public Cache { + public: + SimCache() {} + + ~SimCache() override {} + + const char* Name() const override { return "SimCache"; } + + // returns the maximum configured capacity of the simcache for simulation + virtual size_t GetSimCapacity() const = 0; + + // simcache doesn't provide internal handler reference to user, so always + // PinnedUsage = 0 and the behavior will be not exactly consistent the + // with real cache. + // returns the memory size for the entries residing in the simcache. + virtual size_t GetSimUsage() const = 0; + + // sets the maximum configured capacity of the simcache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will purge old entries + // to fit new capapicty. + virtual void SetSimCapacity(size_t capacity) = 0; + + // returns the lookup times of simcache + virtual uint64_t get_miss_counter() const = 0; + // returns the hit times of simcache + virtual uint64_t get_hit_counter() const = 0; + // reset the lookup and hit counters + virtual void reset_counter() = 0; + // String representation of the statistics of the simcache + virtual std::string ToString() const = 0; + + // Start storing logs of the cache activity (Add/Lookup) into + // a file located at activity_log_file, max_logging_size option can be used to + // stop logging to the file automatically after reaching a specific size in + // bytes, a values of 0 disable this feature + virtual Status StartActivityLogging(const std::string& activity_log_file, + Env* env, + uint64_t max_logging_size = 0) = 0; + + // Stop cache activity logging if any + virtual void StopActivityLogging() = 0; + + // Status of cache logging happening in background + virtual Status GetActivityLoggingStatus() = 0; + + private: + SimCache(const SimCache&); + SimCache& operator=(const SimCache&); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h new file mode 100644 index 000000000..9888fa22d --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h @@ -0,0 +1,465 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <map> +#include <memory> +#include <string> +#include "rocksdb/db.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#endif + +namespace ROCKSDB_NAMESPACE { + +// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d +class StackableDB : public DB { + public: + // StackableDB take sole ownership of the underlying db. + explicit StackableDB(DB* db) : db_(db) {} + + // StackableDB take shared ownership of the underlying db. + explicit StackableDB(std::shared_ptr<DB> db) + : db_(db.get()), shared_db_ptr_(db) {} + + ~StackableDB() { + if (shared_db_ptr_ == nullptr) { + delete db_; + } else { + assert(shared_db_ptr_.get() == db_); + } + db_ = nullptr; + } + + virtual Status Close() override { return db_->Close(); } + + virtual DB* GetBaseDB() { return db_; } + + virtual DB* GetRootDB() override { return db_->GetRootDB(); } + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamily(options, column_family_name, handle); + } + + virtual Status CreateColumnFamilies( + const ColumnFamilyOptions& options, + const std::vector<std::string>& column_family_names, + std::vector<ColumnFamilyHandle*>* handles) override { + return db_->CreateColumnFamilies(options, column_family_names, handles); + } + + virtual Status CreateColumnFamilies( + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles) override { + return db_->CreateColumnFamilies(column_families, handles); + } + + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override { + return db_->DropColumnFamily(column_family); + } + + virtual Status DropColumnFamilies( + const std::vector<ColumnFamilyHandle*>& column_families) override { + return db_->DropColumnFamilies(column_families); + } + + virtual Status DestroyColumnFamilyHandle( + ColumnFamilyHandle* column_family) override { + return db_->DestroyColumnFamilyHandle(column_family); + } + + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override { + return db_->Put(options, column_family, key, val); + } + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override { + return db_->Get(options, column_family, key, value); + } + + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + + using DB::MultiGet; + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, + std::vector<std::string>* values) override { + return db_->MultiGet(options, column_family, keys, values); + } + + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override { + return db_->MultiGet(options, column_family, num_keys, keys, + values, statuses, sorted_input); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector<std::string>& external_files, + const IngestExternalFileOptions& options) override { + return db_->IngestExternalFile(column_family, external_files, options); + } + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& args) override { + return db_->IngestExternalFiles(args); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + + using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, column_family, key, value, value_found); + } + + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); + } + + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->SingleDelete(wopts, column_family, key); + } + + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return db_->Merge(options, column_family, key, value); + } + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + return db_->Write(opts, updates); + } + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, column_family); + } + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families, + std::vector<Iterator*>* iterators) override { + return db_->NewIterators(options, column_families, iterators); + } + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + using DB::GetMapProperty; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); + } + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map<std::string, std::string>* value) override { + return db_->GetMapProperty(column_family, property, value); + } + + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override { + return db_->GetIntProperty(column_family, property, value); + } + + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* value) override { + return db_->GetAggregatedIntProperty(property, value); + } + + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); + } + + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override { + return db_->GetApproximateMemTableStats(column_family, range, count, size); + } + + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override { + return db_->CompactRange(options, column_family, begin, end); + } + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + return db_->CompactFiles(compact_options, column_family, input_file_names, + output_level, output_path_id, output_file_names, + compaction_job_info); + } + + virtual Status PauseBackgroundWork() override { + return db_->PauseBackgroundWork(); + } + virtual Status ContinueBackgroundWork() override { + return db_->ContinueBackgroundWork(); + } + + virtual Status EnableAutoCompaction( + const std::vector<ColumnFamilyHandle*>& column_family_handles) override { + return db_->EnableAutoCompaction(column_family_handles); + } + + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); + } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { + return db_->MaxMemCompactionLevel(column_family); + } + + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { + return db_->Level0StopWriteTrigger(column_family); + } + + virtual const std::string& GetName() const override { return db_->GetName(); } + + virtual Env* GetEnv() const override { return db_->GetEnv(); } + + virtual FileSystem* GetFileSystem() const override { + return db_->GetFileSystem(); + } + + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { + return db_->GetOptions(column_family); + } + + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override { + return db_->GetDBOptions(); + } + + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); + } + virtual Status Flush( + const FlushOptions& fopts, + const std::vector<ColumnFamilyHandle*>& column_families) override { + return db_->Flush(fopts, column_families); + } + + virtual Status SyncWAL() override { return db_->SyncWAL(); } + + virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + + virtual Status LockWAL() override { return db_->LockWAL(); } + + virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + +#ifndef ROCKSDB_LITE + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual void GetLiveFilesMetaData( + std::vector<LiveFileMetaData>* metadata) override { + db_->GetLiveFilesMetaData(metadata); + } + + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr<TraceWriter>&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + +#endif // ROCKSDB_LITE + + virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + virtual bool SetPreserveDeletesSequenceNumber( + SequenceNumber seqnum) override { + return db_->SetPreserveDeletesSequenceNumber(seqnum); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return db_->GetSortedWalFiles(files); + } + + virtual Status GetCurrentWalFile( + std::unique_ptr<LogFile>* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) const override { + return db_->GetDbIdentity(identity); + } + + using DB::SetOptions; + virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, + const std::unordered_map<std::string, std::string>& + new_options) override { + return db_->SetOptions(column_family_handle, new_options); + } + + virtual Status SetDBOptions( + const std::unordered_map<std::string, std::string>& new_options) + override { + return db_->SetDBOptions(new_options); + } + + using DB::ResetStats; + virtual Status ResetStats() override { return db_->ResetStats(); } + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfAllTables(column_family, props); + } + + using DB::GetPropertiesOfTablesInRange; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override { + return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); + } + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, + const TransactionLogIterator::ReadOptions& read_options) override { + return db_->GetUpdatesSince(seq_number, iter, read_options); + } + + virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, + const Slice* end) override { + return db_->SuggestCompactRange(column_family, begin, end); + } + + virtual Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override { + return db_->PromoteL0(column_family, target_level); + } + + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return db_->DefaultColumnFamily(); + } + +#ifndef ROCKSDB_LITE + Status TryCatchUpWithPrimary() override { + return db_->TryCatchUpWithPrimary(); + } +#endif // ROCKSDB_LITE + + protected: + DB* db_; + std::shared_ptr<DB> shared_db_ptr_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h new file mode 100644 index 000000000..b7ee88bc3 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include <atomic> +#include <memory> + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// A factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entires. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual ~CompactOnDeletionCollectorFactory() {} + + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + private: + friend std::shared_ptr<CompactOnDeletionCollectorFactory> + NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger); + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entires. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger) + : sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger) {} + + std::atomic<size_t> sliding_window_size_; + std::atomic<size_t> deletion_trigger_; +}; + +// Creates a factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entires. +// +// @param sliding_window_size "N". Note that this number will be +// round up to the smallest multiple of 128 that is no less +// than the specified size. +// @param deletion_trigger "D". Note that even when "N" is changed, +// the specified number for "D" will not be changed. +extern std::shared_ptr<CompactOnDeletionCollectorFactory> +NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger); +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h new file mode 100644 index 000000000..d6c6722c8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction.h @@ -0,0 +1,540 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator; +class TransactionDB; +class WriteBatchWithIndex; + +using TransactionName = std::string; + +using TransactionID = uint64_t; + +// Provides notification to the caller of SetSnapshotOnNextOperation when +// the actual snapshot gets created +class TransactionNotifier { + public: + virtual ~TransactionNotifier() {} + + // Implement this method to receive notification when a snapshot is + // requested via SetSnapshotOnNextOperation. + virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0; +}; + +// Provides BEGIN/COMMIT/ROLLBACK transactions. +// +// To use transactions, you must first create either an OptimisticTransactionDB +// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for +// more information. +// +// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction(). +// +// It is up to the caller to synchronize access to this object. +// +// See examples/transaction_example.cc for some simple examples. +// +// TODO(agiardullo): Not yet implemented +// -PerfContext statistics +// -Support for using Transactions with DBWithTTL +class Transaction { + public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + + virtual ~Transaction() {} + + // If a transaction has a snapshot set, the transaction will ensure that + // any keys successfully written(or fetched via GetForUpdate()) have not + // been modified outside of this transaction since the time the snapshot was + // set. + // If a snapshot has not been set, the transaction guarantees that keys have + // not been modified since the time each key was first written (or fetched via + // GetForUpdate()). + // + // Using SetSnapshot() will provide stricter isolation guarantees at the + // expense of potentially more transaction failures due to conflicts with + // other writes. + // + // Calling SetSnapshot() has no effect on keys written before this function + // has been called. + // + // SetSnapshot() may be called multiple times if you would like to change + // the snapshot used for different operations in this transaction. + // + // Calling SetSnapshot will not affect the version of Data returned by Get() + // methods. See Transaction::Get() for more details. + virtual void SetSnapshot() = 0; + + // Similar to SetSnapshot(), but will not change the current snapshot + // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called. + // By calling this function, the transaction will essentially call + // SetSnapshot() for you right before performing the next write/GetForUpdate. + // + // Calling SetSnapshotOnNextOperation() will not affect what snapshot is + // returned by GetSnapshot() until the next write/GetForUpdate is executed. + // + // When the snapshot is created the notifier's SnapshotCreated method will + // be called so that the caller can get access to the snapshot. + // + // This is an optimization to reduce the likelihood of conflicts that + // could occur in between the time SetSnapshot() is called and the first + // write/GetForUpdate operation. Eg, this prevents the following + // race-condition: + // + // txn1->SetSnapshot(); + // txn2->Put("A", ...); + // txn2->Commit(); + // txn1->GetForUpdate(opts, "A", ...); // FAIL! + virtual void SetSnapshotOnNextOperation( + std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0; + + // Returns the Snapshot created by the last call to SetSnapshot(). + // + // REQUIRED: The returned Snapshot is only valid up until the next time + // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot() + // is called, or the Transaction is deleted. + virtual const Snapshot* GetSnapshot() const = 0; + + // Clears the current snapshot (i.e. no snapshot will be 'set') + // + // This removes any snapshot that currently exists or is set to be created + // on the next update operation (SetSnapshotOnNextOperation). + // + // Calling ClearSnapshot() has no effect on keys written before this function + // has been called. + // + // If a reference to a snapshot was retrieved via GetSnapshot(), it will no + // longer be valid and should be discarded after a call to ClearSnapshot(). + virtual void ClearSnapshot() = 0; + + // Prepare the current transaction for 2PC + virtual Status Prepare() = 0; + + // Write all batched keys to the db atomically. + // + // Returns OK on success. + // + // May return any error status that could be returned by DB:Write(). + // + // If this transaction was created by an OptimisticTransactionDB(), + // Status::Busy() may be returned if the transaction could not guarantee + // that there are no write conflicts. Status::TryAgain() may be returned + // if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain). + // + // If this transaction was created by a TransactionDB(), Status::Expired() + // may be returned if this transaction has lived for longer than + // TransactionOptions.expiration. + virtual Status Commit() = 0; + + // Discard all batched writes in this transaction. + virtual Status Rollback() = 0; + + // Records the state of the transaction for future calls to + // RollbackToSavePoint(). May be called multiple times to set multiple save + // points. + virtual void SetSavePoint() = 0; + + // Undo all operations in this transaction (Put, Merge, Delete, PutLogData) + // since the most recent call to SetSavePoint() and removes the most recent + // SetSavePoint(). + // If there is no previous call to SetSavePoint(), returns Status::NotFound() + virtual Status RollbackToSavePoint() = 0; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + + // This function is similar to DB::Get() except it will also read pending + // changes in this transaction. Currently, this function will return + // Status::MergeInProgress if the most recent write to the queried key in + // this batch is a Merge. + // + // If read_options.snapshot is not set, the current version of the key will + // be read. Calling SetSnapshot() does not affect the version of the data + // returned. + // + // Note that setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) = 0; + + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + + virtual std::vector<Status> MultiGet( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + + virtual std::vector<Status> MultiGet(const ReadOptions& options, + const std::vector<Slice>& keys, + std::vector<std::string>* values) = 0; + + // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are + // expected to override this with an implementation that calls + // DBImpl::MultiGet() + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Get(options, column_family, keys[i], &values[i]); + } + } + + // Read this key and ensure that this transaction will only + // be able to be committed if this key is not written outside this + // transaction after it has first been read (or after the snapshot if a + // snapshot is set in this transaction and do_validate is true). If + // do_validate is false, ReadOptions::snapshot is expected to be nullptr so + // that GetForUpdate returns the latest committed value. The transaction + // behavior is the same regardless of whether the key exists or not. + // + // Note: Currently, this function will return Status::MergeInProgress + // if the most recent write to the queried key in this batch is a Merge. + // + // The values returned by this function are similar to Transaction::Get(). + // If value==nullptr, then this function will not read any data, but will + // still ensure that this key cannot be written to by outside of this + // transaction. + // + // If this transaction was created by an OptimisticTransaction, GetForUpdate() + // could cause commit() to fail. Otherwise, it could return any error + // that could be returned by DB::Get(). + // + // If this transaction was created by a TransactionDB, it can return + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // Status::MergeInProgress() if merge operations cannot be resolved. + // or other errors if this key could not be read. + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, + bool exclusive = true, + const bool do_validate = true) = 0; + + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, + bool exclusive = true, + const bool do_validate = true) { + if (pinnable_val == nullptr) { + std::string* null_str = nullptr; + return GetForUpdate(options, column_family, key, null_str, exclusive, + do_validate); + } else { + auto s = GetForUpdate(options, column_family, key, + pinnable_val->GetSelf(), exclusive, do_validate); + pinnable_val->PinSelf(); + return s; + } + } + + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, + std::string* value, bool exclusive = true, + const bool do_validate = true) = 0; + + virtual std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, + const std::vector<ColumnFamilyHandle*>& column_family, + const std::vector<Slice>& keys, std::vector<std::string>* values) = 0; + + virtual std::vector<Status> MultiGetForUpdate( + const ReadOptions& options, const std::vector<Slice>& keys, + std::vector<std::string>* values) = 0; + + // Returns an iterator that will iterate on all keys in the default + // column family including both keys in the DB and uncommitted keys in this + // transaction. + // + // Setting read_options.snapshot will affect what is read from the + // DB but will NOT change which keys are read from this transaction (the keys + // in this transaction do not yet belong to any snapshot and will be fetched + // regardless). + // + // Caller is responsible for deleting the returned Iterator. + // + // The returned iterator is only valid until Commit(), Rollback(), or + // RollbackToSavePoint() is called. + virtual Iterator* GetIterator(const ReadOptions& read_options) = 0; + + virtual Iterator* GetIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) = 0; + + // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding + // functions in WriteBatch, but will also do conflict checking on the + // keys being written. + // + // assume_tracked=true expects the key be already tracked. More + // specifically, it means the the key was previous tracked in the same + // savepoint, with the same exclusive flag, and at a lower sequence number. + // If valid then it skips ValidateSnapshot. Returns error otherwise. + // + // If this Transaction was created on an OptimisticTransactionDB, these + // functions should always return Status::OK(). + // + // If this Transaction was created on a TransactionDB, the status returned + // can be: + // Status::OK() on success, + // Status::Busy() if there is a write conflict, + // Status::TimedOut() if a lock could not be acquired, + // Status::TryAgain() if the memtable history size is not large enough + // (See max_write_buffer_size_to_maintain) + // or other errors on unexpected failures. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, const bool assume_tracked = false) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value, + const bool assume_tracked = false) = 0; + virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; + + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, + const bool assume_tracked = false) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status Delete(const SliceParts& key) = 0; + + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key, + const bool assume_tracked = false) = 0; + virtual Status SingleDelete(const SliceParts& key) = 0; + + // PutUntracked() will write a Put to the batch of operations to be committed + // in this transaction. This write will only happen if this transaction + // gets committed successfully. But unlike Transaction::Put(), + // no conflict checking will be done for this key. + // + // If this Transaction was created on a PessimisticTransactionDB, this + // function will still acquire locks necessary to make sure this write doesn't + // cause conflicts in other transactions and may return Status::Busy(). + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; + virtual Status PutUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key, + const SliceParts& value) = 0; + virtual Status PutUntracked(const SliceParts& key, + const SliceParts& value) = 0; + + virtual Status MergeUntracked(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) = 0; + virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0; + + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status DeleteUntracked(const Slice& key) = 0; + virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, + const SliceParts& key) = 0; + virtual Status DeleteUntracked(const SliceParts& key) = 0; + virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status SingleDeleteUntracked(const Slice& key) = 0; + + // Similar to WriteBatch::PutLogData + virtual void PutLogData(const Slice& blob) = 0; + + // By default, all Put/Merge/Delete operations will be indexed in the + // transaction so that Get/GetForUpdate/GetIterator can search for these + // keys. + // + // If the caller does not want to fetch the keys about to be written, + // they may want to avoid indexing as a performance optimization. + // Calling DisableIndexing() will turn off indexing for all future + // Put/Merge/Delete operations until EnableIndexing() is called. + // + // If a key is Put/Merge/Deleted after DisableIndexing is called and then + // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is + // undefined. + virtual void DisableIndexing() = 0; + virtual void EnableIndexing() = 0; + + // Returns the number of distinct Keys being tracked by this transaction. + // If this transaction was created by a TransactionDB, this is the number of + // keys that are currently locked by this transaction. + // If this transaction was created by an OptimisticTransactionDB, this is the + // number of keys that need to be checked for conflicts at commit time. + virtual uint64_t GetNumKeys() const = 0; + + // Returns the number of Puts/Deletes/Merges that have been applied to this + // transaction so far. + virtual uint64_t GetNumPuts() const = 0; + virtual uint64_t GetNumDeletes() const = 0; + virtual uint64_t GetNumMerges() const = 0; + + // Returns the elapsed time in milliseconds since this Transaction began. + virtual uint64_t GetElapsedTime() const = 0; + + // Fetch the underlying write batch that contains all pending changes to be + // committed. + // + // Note: You should not write or delete anything from the batch directly and + // should only use the functions in the Transaction class to + // write to this transaction. + virtual WriteBatchWithIndex* GetWriteBatch() = 0; + + // Change the value of TransactionOptions.lock_timeout (in milliseconds) for + // this transaction. + // Has no effect on OptimisticTransactions. + virtual void SetLockTimeout(int64_t timeout) = 0; + + // Return the WriteOptions that will be used during Commit() + virtual WriteOptions* GetWriteOptions() = 0; + + // Reset the WriteOptions that will be used during Commit(). + virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. + virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + + virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0; + + virtual WriteBatch* GetCommitTimeWriteBatch() = 0; + + virtual void SetLogNumber(uint64_t log) { log_number_ = log; } + + virtual uint64_t GetLogNumber() const { return log_number_; } + + virtual Status SetName(const TransactionName& name) = 0; + + virtual TransactionName GetName() const { return name_; } + + virtual TransactionID GetID() const { return 0; } + + virtual bool IsDeadlockDetect() const { return false; } + + virtual std::vector<TransactionID> GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { + assert(false); + return std::vector<TransactionID>(); + } + + enum TransactionState { + STARTED = 0, + AWAITING_PREPARE = 1, + PREPARED = 2, + AWAITING_COMMIT = 3, + COMMITED = 4, + AWAITING_ROLLBACK = 5, + ROLLEDBACK = 6, + LOCKS_STOLEN = 7, + }; + + TransactionState GetState() const { return txn_state_; } + void SetState(TransactionState state) { txn_state_ = state; } + + // NOTE: Experimental feature + // The globally unique id with which the transaction is identified. This id + // might or might not be set depending on the implementation. Similarly the + // implementation decides the point in lifetime of a transaction at which it + // assigns the id. Although currently it is the case, the id is not guaranteed + // to remain the same across restarts. + uint64_t GetId() { return id_; } + + protected: + explicit Transaction(const TransactionDB* /*db*/) {} + Transaction() : log_number_(0), txn_state_(STARTED) {} + + // the log in which the prepared section for this txn resides + // (for two phase commit) + uint64_t log_number_; + TransactionName name_; + + // Execution status of the transaction. + std::atomic<TransactionState> txn_state_; + + uint64_t id_ = 0; + virtual void SetId(uint64_t id) { + assert(id_ == 0); + id_ = id; + } + + virtual uint64_t GetLastLogNumber() const { return log_number_; } + + private: + friend class PessimisticTransactionDB; + friend class WriteUnpreparedTxnDB; + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h new file mode 100644 index 000000000..73b7416a0 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h @@ -0,0 +1,309 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include <utility> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" + +// Database with Transaction support. +// +// See transaction.h and examples/transaction_example.cc + +namespace ROCKSDB_NAMESPACE { + +class TransactionDBMutexFactory; + +enum TxnDBWritePolicy { + WRITE_COMMITTED = 0, // write only the committed data + WRITE_PREPARED, // write data after the prepare phase of 2pc + WRITE_UNPREPARED // write data before the prepare phase of 2pc +}; + +const uint32_t kInitialMaxDeadlocks = 5; + +struct TransactionDBOptions { + // Specifies the maximum number of keys that can be locked at the same time + // per column family. + // If the number of locked keys is greater than max_num_locks, transaction + // writes (or GetForUpdate) will return an error. + // If this value is not positive, no limit will be enforced. + int64_t max_num_locks = -1; + + // Stores the number of latest deadlocks to track + uint32_t max_num_deadlocks = kInitialMaxDeadlocks; + + // Increasing this value will increase the concurrency by dividing the lock + // table (per column family) into more sub-tables, each with their own + // separate + // mutex. + size_t num_stripes = 16; + + // If positive, specifies the default wait timeout in milliseconds when + // a transaction attempts to lock a key if not specified by + // TransactionOptions::lock_timeout. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout. Not using a timeout is not recommended + // as it can lead to deadlocks. Currently, there is no deadlock-detection to + // recover + // from a deadlock. + int64_t transaction_lock_timeout = 1000; // 1 second + + // If positive, specifies the wait timeout in milliseconds when writing a key + // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() + // directly). + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, there is no timeout and will block indefinitely when acquiring + // a lock. + // + // Not using a timeout can lead to deadlocks. Currently, there + // is no deadlock-detection to recover from a deadlock. While DB writes + // cannot deadlock with other DB writes, they can deadlock with a transaction. + // A negative timeout should only be used if all transactions have a small + // expiration set. + int64_t default_lock_timeout = 1000; // 1 second + + // If set, the TransactionDB will use this implementation of a mutex and + // condition variable for all transaction locking instead of the default + // mutex/condvar implementation. + std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; + + // The policy for when to write the data into the DB. The default policy is to + // write only the committed data (WRITE_COMMITTED). The data could be written + // before the commit phase. The DB then needs to provide the mechanisms to + // tell apart committed from uncommitted data. + TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks. MyRocks uses merge operands for autoinc column id without + // however obtaining locks. This breaks the assumption behind the rollback + // logic in myrocks. This hack of simply not rolling back merge operands works + // for the special way that myrocks uses this operands. + bool rollback_merge_operands = false; + + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjuction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + int64_t default_write_batch_flush_threshold = 0; + + private: + // 128 entries + size_t wp_snapshot_cache_bits = static_cast<size_t>(7); + // 8m entry, 64MB size + size_t wp_commit_cache_bits = static_cast<size_t>(23); + + // For testing, whether transaction name should be auto-generated or not. This + // is useful for write unprepared which requires named transactions. + bool autogenerate_name = false; + + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxn; + friend class WritePreparedTransactionTestBase; + friend class TransactionTestBase; + friend class MySQLStyleTransactionTest; +}; + +struct TransactionOptions { + // Setting set_snapshot=true is the same as calling + // Transaction::SetSnapshot(). + bool set_snapshot = false; + + // Setting to true means that before acquiring locks, this transaction will + // check if doing so will cause a deadlock. If so, it will return with + // Status::Busy. The user should retry their transaction. + bool deadlock_detect = false; + + // If set, it states that the CommitTimeWriteBatch represents the latest state + // of the application, has only one sub-batch, i.e., no duplicate keys, and + // meant to be used later during recovery. It enables an optimization to + // postpone updating the memtable with CommitTimeWriteBatch to only + // SwitchMemtable or recovery. + bool use_only_the_last_commit_time_batch_for_recovery = false; + + // TODO(agiardullo): TransactionDB does not yet support comparators that allow + // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only + // return 0 if + // a.compare(b) returns 0. + + // If positive, specifies the wait timeout in milliseconds when + // a transaction attempts to lock a key. + // + // If 0, no waiting is done if a lock cannot instantly be acquired. + // If negative, TransactionDBOptions::transaction_lock_timeout will be used. + int64_t lock_timeout = -1; + + // Expiration duration in milliseconds. If non-negative, transactions that + // last longer than this many milliseconds will fail to commit. If not set, + // a forgotten transaction that is never committed, rolled back, or deleted + // will never relinquish any locks it holds. This could prevent keys from + // being written by other writers. + int64_t expiration = -1; + + // The number of traversals to make during deadlock detection. + int64_t deadlock_detect_depth = 50; + + // The maximum number of bytes used for the write batch. 0 means no limit. + size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This could be as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. It could also be used during recovery if (i) + // application guarantees no conflict between prepared transactions in the WAL + // (ii) application guarantees that recovered transactions will be rolled + // back/commit before new transactions start. + // Default: false + bool skip_concurrency_control = false; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; +}; + +// The per-write optimizations that do not involve transactions. TransactionDB +// implementation might or might not make use of the specified optimizations. +struct TransactionDBWriteOptimizations { + // If it is true it means that the application guarantees that the + // key-set in the write batch do not conflict with any concurrent transaction + // and hence the concurrency control mechanism could be skipped for this + // write. + bool skip_concurrency_control = false; + // If true, the application guarantees that there is no duplicate <column + // family, key> in the write batch and any employed mechanism to handle + // duplicate keys could be skipped. + bool skip_duplicate_key_check = false; +}; + +struct KeyLockInfo { + std::string key; + std::vector<TransactionID> ids; + bool exclusive; +}; + +struct DeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + std::string m_waiting_key; +}; + +struct DeadlockPath { + std::vector<DeadlockInfo> path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +class TransactionDB : public StackableDB { + public: + // Optimized version of ::Write that receives more optimization request such + // as skip_concurrency_control. + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations&, + WriteBatch* updates) { + // The default implementation ignores TransactionDBWriteOptimizations and + // falls back to the un-optimized version of ::Write + return Write(opts, updates); + } + // Open a TransactionDB similar to DB::Open(). + // Internally call PrepareWrap() and WrapDB() + // If the return status is not ok, then dbptr is set to nullptr. + static Status Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr); + + static Status Open(const DBOptions& db_options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, + TransactionDB** dbptr); + // Note: PrepareWrap() may change parameters, make copies before the + // invocation if needed. + static void PrepareWrap(DBOptions* db_options, + std::vector<ColumnFamilyDescriptor>* column_families, + std::vector<size_t>* compaction_enabled_cf_indices); + // If the return status is not ok, then dbptr will bet set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. + static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles, + TransactionDB** dbptr); + // If the return status is not ok, then dbptr will bet set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. + static Status WrapStackableDB( + StackableDB* db, const TransactionDBOptions& txn_db_options, + const std::vector<size_t>& compaction_enabled_cf_indices, + const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); + // Since the destructor in StackableDB is virtual, this destructor is virtual + // too. The root db will be deleted by the base's destructor. + ~TransactionDB() override {} + + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. + // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. + virtual Transaction* BeginTransaction( + const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions(), + Transaction* old_txn = nullptr) = 0; + + virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; + virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0; + + // Returns set of all locks held. + // + // The mapping is column family id -> KeyLockInfo + virtual std::unordered_multimap<uint32_t, KeyLockInfo> + GetLockStatusData() = 0; + virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; + virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; + + protected: + // To Create an TransactionDB, call Open() + // The ownership of db is transferred to the base StackableDB + explicit TransactionDB(DB* db) : StackableDB(db) {} + // No copying allowed + TransactionDB(const TransactionDB&) = delete; + void operator=(const TransactionDB&) = delete; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h new file mode 100644 index 000000000..96a42adf8 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <memory> + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// TransactionDBMutex and TransactionDBCondVar APIs allows applications to +// implement custom mutexes and condition variables to be used by a +// TransactionDB when locking keys. +// +// To open a TransactionDB with a custom TransactionDBMutexFactory, set +// TransactionDBOptions.custom_mutex_factory. + +class TransactionDBMutex { + public: + virtual ~TransactionDBMutex() {} + + // Attempt to acquire lock. Return OK on success, or other Status on failure. + // If returned status is OK, TransactionDB will eventually call UnLock(). + virtual Status Lock() = 0; + + // Attempt to acquire lock. If timeout is non-negative, operation may be + // failed after this many microseconds. + // Returns OK on success, + // TimedOut if timed out, + // or other Status on failure. + // If returned status is OK, TransactionDB will eventually call UnLock(). + virtual Status TryLockFor(int64_t timeout_time) = 0; + + // Unlock Mutex that was successfully locked by Lock() or TryLockUntil() + virtual void UnLock() = 0; +}; + +class TransactionDBCondVar { + public: + virtual ~TransactionDBCondVar() {} + + // Block current thread until condition variable is notified by a call to + // Notify() or NotifyAll(). Wait() will be called with mutex locked. + // Returns OK if notified. + // Returns non-OK if TransactionDB should stop waiting and fail the operation. + // May return OK spuriously even if not notified. + virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0; + + // Block current thread until condition variable is notified by a call to + // Notify() or NotifyAll(), or if the timeout is reached. + // Wait() will be called with mutex locked. + // + // If timeout is non-negative, operation should be failed after this many + // microseconds. + // If implementing a custom version of this class, the implementation may + // choose to ignore the timeout. + // + // Returns OK if notified. + // Returns TimedOut if timeout is reached. + // Returns other status if TransactionDB should otherwis stop waiting and + // fail the operation. + // May return OK spuriously even if not notified. + virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex, + int64_t timeout_time) = 0; + + // If any threads are waiting on *this, unblock at least one of the + // waiting threads. + virtual void Notify() = 0; + + // Unblocks all threads waiting on *this. + virtual void NotifyAll() = 0; +}; + +// Factory class that can allocate mutexes and condition variables. +class TransactionDBMutexFactory { + public: + // Create a TransactionDBMutex object. + virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0; + + // Create a TransactionDBCondVar object. + virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0; + + virtual ~TransactionDBMutexFactory() {} +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/utility_db.h b/src/rocksdb/include/rocksdb/utilities/utility_db.h new file mode 100644 index 000000000..cf2e5811c --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/utility_db.h @@ -0,0 +1,34 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/stackable_db.h" + +namespace ROCKSDB_NAMESPACE { + +// Please don't use this class. It's deprecated +class UtilityDB { + public: + // This function is here only for backwards compatibility. Please use the + // functions defined in DBWithTTl (rocksdb/utilities/db_ttl.h) + // (deprecated) +#if defined(__GNUC__) || defined(__clang__) + __attribute__((deprecated)) +#elif _WIN32 + __declspec(deprecated) +#endif + static Status + OpenTtlDB(const Options& options, const std::string& name, + StackableDB** dbptr, int32_t ttl = 0, bool read_only = false); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h new file mode 100644 index 000000000..424aa1582 --- /dev/null +++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h @@ -0,0 +1,278 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. +#pragma once + +#ifndef ROCKSDB_LITE + +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_batch_base.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class Comparator; +class DB; +class ReadCallback; +struct ReadOptions; +struct DBOptions; + +enum WriteType { + kPutRecord, + kMergeRecord, + kDeleteRecord, + kSingleDeleteRecord, + kDeleteRangeRecord, + kLogDataRecord, + kXIDRecord, +}; + +// an entry for Put, Merge, Delete, or SingleDelete entry for write batches. +// Used in WBWIIterator. +struct WriteEntry { + WriteType type; + Slice key; + Slice value; +}; + +// Iterator of one column family out of a WriteBatchWithIndex. +class WBWIIterator { + public: + virtual ~WBWIIterator() {} + + virtual bool Valid() const = 0; + + virtual void SeekToFirst() = 0; + + virtual void SeekToLast() = 0; + + virtual void Seek(const Slice& key) = 0; + + virtual void SeekForPrev(const Slice& key) = 0; + + virtual void Next() = 0; + + virtual void Prev() = 0; + + // the return WriteEntry is only valid until the next mutation of + // WriteBatchWithIndex + virtual WriteEntry Entry() const = 0; + + virtual Status status() const = 0; +}; + +// A WriteBatchWithIndex with a binary searchable index built for all the keys +// inserted. +// In Put(), Merge() Delete(), or SingleDelete(), the same function of the +// wrapped will be called. At the same time, indexes will be built. +// By calling GetWriteBatch(), a user will get the WriteBatch for the data +// they inserted, which can be used for DB::Write(). +// A user can call NewIterator() to create an iterator. +class WriteBatchWithIndex : public WriteBatchBase { + public: + // backup_index_comparator: the backup comparator used to compare keys + // within the same column family, if column family is not given in the + // interface, or we can't find a column family from the column family handle + // passed in, backup_index_comparator will be used for the column family. + // reserved_bytes: reserved bytes in underlying WriteBatch + // max_bytes: maximum size of underlying WriteBatch in bytes + // overwrite_key: if true, overwrite the key in the index when inserting + // the same key as previously, so iterator will never + // show two entries with the same key. + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0, bool overwrite_key = false, + size_t max_bytes = 0); + + ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); + + using WriteBatchBase::Put; + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Put(const Slice& key, const Slice& value) override; + + using WriteBatchBase::Merge; + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + Status Merge(const Slice& key, const Slice& value) override; + + using WriteBatchBase::Delete; + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override; + + using WriteBatchBase::SingleDelete; + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override; + + using WriteBatchBase::DeleteRange; + Status DeleteRange(ColumnFamilyHandle* /* column_family */, + const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + Status DeleteRange(const Slice& /* begin_key */, + const Slice& /* end_key */) override { + return Status::NotSupported( + "DeleteRange unsupported in WriteBatchWithIndex"); + } + + using WriteBatchBase::PutLogData; + Status PutLogData(const Slice& blob) override; + + using WriteBatchBase::Clear; + void Clear() override; + + using WriteBatchBase::GetWriteBatch; + WriteBatch* GetWriteBatch() override; + + // Create an iterator of a column family. User can call iterator.Seek() to + // search to the next entry of or after a key. Keys will be iterated in the + // order given by index_comparator. For multiple updates on the same key, + // each update will be returned as a separate entry, in the order of update + // time. + // + // The returned iterator should be deleted by the caller. + WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + // Create an iterator of the default column family. + WBWIIterator* NewIterator(); + + // Will create a new Iterator that will use WBWIIterator as a delta and + // base_iterator as base. + // + // This function is only supported if the WriteBatchWithIndex was + // constructed with overwrite_key=true. + // + // The returned iterator should be deleted by the caller. + // The base_iterator is now 'owned' by the returned iterator. Deleting the + // returned iterator will also delete the base_iterator. + // + // Updating write batch with the current key of the iterator is not safe. + // We strongly recommand users not to do it. It will invalidate the current + // key() and value() of the iterator. This invalidation happens even before + // the write batch update finishes. The state may recover after Next() is + // called. + Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, + Iterator* base_iterator, + const ReadOptions* opts = nullptr); + // default column family + Iterator* NewIteratorWithBase(Iterator* base_iterator); + + // Similar to DB::Get() but will only read the key from this batch. + // If the batch does not have enough data to resolve Merge operations, + // MergeInProgress status may be returned. + Status GetFromBatch(ColumnFamilyHandle* column_family, + const DBOptions& options, const Slice& key, + std::string* value); + + // Similar to previous function but does not require a column_family. + // Note: An InvalidArgument status will be returned if there are any Merge + // operators for this key. Use previous method instead. + Status GetFromBatch(const DBOptions& options, const Slice& key, + std::string* value) { + return GetFromBatch(nullptr, options, key, value); + } + + // Similar to DB::Get() but will also read writes from this batch. + // + // This function will query both this batch and the DB and then merge + // the results using the DB's merge operator (if the batch contains any + // merge requests). + // + // Setting read_options.snapshot will affect what is read from the DB + // but will NOT change which keys are read from the batch (the keys in + // this batch do not yet belong to any snapshot and will be fetched + // regardless). + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, PinnableSlice* value); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); + + // An overload of the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value); + + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input); + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + void SetSavePoint() override; + + // Remove all entries in this batch (Put, Merge, Delete, SingleDelete, + // PutLogData) since the most recent call to SetSavePoint() and removes the + // most recent save point. + // If there is no previous call to SetSavePoint(), behaves the same as + // Clear(). + // + // Calling RollbackToSavePoint invalidates any open iterators on this batch. + // + // Returns Status::OK() on success, + // Status::NotFound() if no previous call to SetSavePoint(), + // or other Status on corruption. + Status RollbackToSavePoint() override; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status PopSavePoint() override; + + void SetMaxBytes(size_t max_bytes) override; + size_t GetDataSize() const; + + private: + friend class PessimisticTransactionDB; + friend class WritePreparedTxn; + friend class WriteUnpreparedTxn; + friend class WriteBatchWithIndex_SubBatchCnt_Test; + // Returns the number of sub-batches inside the write batch. A sub-batch + // starts right before inserting a key that is a duplicate of a key in the + // last sub-batch. + size_t SubBatchCnt(); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, ReadCallback* callback); + void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input, ReadCallback* callback); + struct Rep; + std::unique_ptr<Rep> rep; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h new file mode 100644 index 000000000..d364b7f95 --- /dev/null +++ b/src/rocksdb/include/rocksdb/version.h @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#define ROCKSDB_MAJOR 6 +#define ROCKSDB_MINOR 8 +#define ROCKSDB_PATCH 1 + +// Do not use these. We made the mistake of declaring macros starting with +// double underscore. Now we have to live with our choice. We'll deprecate these +// at some point +#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR +#define __ROCKSDB_MINOR__ ROCKSDB_MINOR +#define __ROCKSDB_PATCH__ ROCKSDB_PATCH diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h new file mode 100644 index 000000000..98eddc2e2 --- /dev/null +++ b/src/rocksdb/include/rocksdb/wal_filter.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <map> +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteBatch; + +// WALFilter allows an application to inspect write-ahead-log (WAL) +// records or modify their processing on recovery. +// Please see the details below. +class WalFilter { + public: + enum class WalProcessingOption { + // Continue processing as usual + kContinueProcessing = 0, + // Ignore the current record but continue processing of log(s) + kIgnoreCurrentRecord = 1, + // Stop replay of logs and discard logs + // Logs won't be replayed on subsequent recovery + kStopReplay = 2, + // Corrupted record detected by filter + kCorruptedRecord = 3, + // Marker for enum count + kWalProcessingOptionMax = 4 + }; + + virtual ~WalFilter() {} + + // Provide ColumnFamily->LogNumber map to filter + // so that filter can determine whether a log number applies to a given + // column family (i.e. that log hasn't been flushed to SST already for the + // column family). + // We also pass in name->id map as only name is known during + // recovery (as handles are opened post-recovery). + // while write batch callbacks happen in terms of column family id. + // + // @params cf_lognumber_map column_family_id to lognumber map + // @params cf_name_id_map column_family_name to column_family_id map + + virtual void ColumnFamilyLogNumberMap( + const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/, + const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {} + + // LogRecord is invoked for each log record encountered for all the logs + // during replay on logs on recovery. This method can be used to: + // * inspect the record (using the batch parameter) + // * ignoring current record + // (by returning WalProcessingOption::kIgnoreCurrentRecord) + // * reporting corrupted record + // (by returning WalProcessingOption::kCorruptedRecord) + // * stop log replay + // (by returning kStop replay) - please note that this implies + // discarding the logs from current record onwards. + // + // @params log_number log_number of the current log. + // Filter might use this to determine if the log + // record is applicable to a certain column family. + // @params log_file_name log file name - only for informational purposes + // @params batch batch encountered in the log during recovery + // @params new_batch new_batch to populate if filter wants to change + // the batch (for example to filter some records out, + // or alter some records). + // Please note that the new batch MUST NOT contain + // more records than original, else recovery would + // be failed. + // @params batch_changed Whether batch was changed by the filter. + // It must be set to true if new_batch was populated, + // else new_batch has no effect. + // @returns Processing option for the current record. + // Please see WalProcessingOption enum above for + // details. + virtual WalProcessingOption LogRecordFound( + unsigned long long /*log_number*/, const std::string& /*log_file_name*/, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { + // Default implementation falls back to older function for compatibility + return LogRecord(batch, new_batch, batch_changed); + } + + // Please see the comments for LogRecord above. This function is for + // compatibility only and contains a subset of parameters. + // New code should use the function above. + virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const { + return WalProcessingOption::kContinueProcessing; + } + + // Returns a name that identifies this WAL filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h new file mode 100644 index 000000000..51fd4d8ac --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_batch.h @@ -0,0 +1,377 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#pragma once + +#include <stdint.h> +#include <atomic> +#include <memory> +#include <string> +#include <vector> +#include "rocksdb/status.h" +#include "rocksdb/write_batch_base.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class ColumnFamilyHandle; +struct SavePoints; +struct SliceParts; + +struct SavePoint { + size_t size; // size of rep_ + int count; // count of elements in rep_ + uint32_t content_flags; + + SavePoint() : size(0), count(0), content_flags(0) {} + + SavePoint(size_t _size, int _count, uint32_t _flags) + : size(_size), count(_count), content_flags(_flags) {} + + void clear() { + size = 0; + count = 0; + content_flags = 0; + } + + bool is_cleared() const { return (size | count | content_flags) == 0; } +}; + +class WriteBatch : public WriteBatchBase { + public: + explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); + ~WriteBatch() override; + + using WriteBatchBase::Put; + // Store the mapping "key->value" in the database. + Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Put(const Slice& key, const Slice& value) override { + return Put(nullptr, key, value); + } + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Put(const SliceParts& key, const SliceParts& value) override { + return Put(nullptr, key, value); + } + + using WriteBatchBase::Delete; + // If the database contains a mapping for "key", erase it. Else do nothing. + Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; + Status Delete(const Slice& key) override { return Delete(nullptr, key); } + + // variant that takes SliceParts + Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } + + using WriteBatchBase::SingleDelete; + // WriteBatch implementation of DB::SingleDelete(). See db.h. + Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) override; + Status SingleDelete(const Slice& key) override { + return SingleDelete(nullptr, key); + } + + // variant that takes SliceParts + Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) override; + Status SingleDelete(const SliceParts& key) override { + return SingleDelete(nullptr, key); + } + + using WriteBatchBase::DeleteRange; + // WriteBatch implementation of DB::DeleteRange(). See db.h. + Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, + const Slice& end_key) override; + Status DeleteRange(const Slice& begin_key, const Slice& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + + // variant that takes SliceParts + Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) override; + Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) override { + return DeleteRange(nullptr, begin_key, end_key); + } + + using WriteBatchBase::Merge; + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + Status Merge(const Slice& key, const Slice& value) override { + return Merge(nullptr, key, value); + } + + // variant that takes SliceParts + Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) override; + Status Merge(const SliceParts& key, const SliceParts& value) override { + return Merge(nullptr, key, value); + } + + using WriteBatchBase::PutLogData; + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + Status PutLogData(const Slice& blob) override; + + using WriteBatchBase::Clear; + // Clear all updates buffered in this batch. + void Clear() override; + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + void SetSavePoint() override; + + // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the + // most recent call to SetSavePoint() and removes the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status RollbackToSavePoint() override; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + Status PopSavePoint() override; + + // Support for iterating over the contents of a batch. + class Handler { + public: + virtual ~Handler(); + // All handler functions in this class provide default implementations so + // we won't break existing clients of Handler on a source code level when + // adding a new member function. + + // default implementation will just call Put without column family for + // backwards compatibility. If the column family is not default, + // the function is noop + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + // Put() historically doesn't return status. We didn't want to be + // backwards incompatible so we didn't change the return status + // (this is a public API). We do an ordinary get and return Status::OK() + Put(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and PutCF not implemented"); + } + virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {} + + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + Delete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and DeleteCF not implemented"); + } + virtual void Delete(const Slice& /*key*/) {} + + virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + SingleDelete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and SingleDeleteCF not implemented"); + } + virtual void SingleDelete(const Slice& /*key*/) {} + + virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, + const Slice& /*begin_key*/, + const Slice& /*end_key*/) { + return Status::InvalidArgument("DeleteRangeCF not implemented"); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } + virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} + + virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/, + const Slice& /*key*/, + const Slice& /*value*/) { + return Status::InvalidArgument("PutBlobIndexCF not implemented"); + } + + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + + virtual Status MarkBeginPrepare(bool = false) { + return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); + } + + virtual Status MarkEndPrepare(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkEndPrepare() handler not defined."); + } + + virtual Status MarkNoop(bool /*empty_batch*/) { + return Status::InvalidArgument("MarkNoop() handler not defined."); + } + + virtual Status MarkRollback(const Slice& /*xid*/) { + return Status::InvalidArgument( + "MarkRollbackPrepare() handler not defined."); + } + + virtual Status MarkCommit(const Slice& /*xid*/) { + return Status::InvalidArgument("MarkCommit() handler not defined."); + } + + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + + protected: + friend class WriteBatchInternal; + virtual bool WriteAfterCommit() const { return true; } + virtual bool WriteBeforePrepare() const { return false; } + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + uint32_t Count() const; + + // Returns true if PutCF will be called during Iterate + bool HasPut() const; + + // Returns true if DeleteCF will be called during Iterate + bool HasDelete() const; + + // Returns true if SingleDeleteCF will be called during Iterate + bool HasSingleDelete() const; + + // Returns true if DeleteRangeCF will be called during Iterate + bool HasDeleteRange() const; + + // Returns true if MergeCF will be called during Iterate + bool HasMerge() const; + + // Returns true if MarkBeginPrepare will be called during Iterate + bool HasBeginPrepare() const; + + // Returns true if MarkEndPrepare will be called during Iterate + bool HasEndPrepare() const; + + // Returns trie if MarkCommit will be called during Iterate + bool HasCommit() const; + + // Returns trie if MarkRollback will be called during Iterate + bool HasRollback() const; + + // Assign timestamp to write batch + Status AssignTimestamp(const Slice& ts); + + // Assign timestamps to write batch + Status AssignTimestamps(const std::vector<Slice>& ts_list); + + using WriteBatchBase::GetWriteBatch; + WriteBatch* GetWriteBatch() override { return this; } + + // Constructor with a serialized string object + explicit WriteBatch(const std::string& rep); + explicit WriteBatch(std::string&& rep); + + WriteBatch(const WriteBatch& src); + WriteBatch(WriteBatch&& src) noexcept; + WriteBatch& operator=(const WriteBatch& src); + WriteBatch& operator=(WriteBatch&& src); + + // marks this point in the WriteBatch as the last record to + // be inserted into the WAL, provided the WAL is enabled + void MarkWalTerminationPoint(); + const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; } + + void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + + private: + friend class WriteBatchInternal; + friend class LocalSavePoint; + // TODO(myabandeh): this is needed for a hack to collapse the write batch and + // remove duplicate keys. Remove it when the hack is replaced with a proper + // solution. + friend class WriteBatchWithIndex; + std::unique_ptr<SavePoints> save_points_; + + // When sending a WriteBatch through WriteImpl we might want to + // specify that only the first x records of the batch be written to + // the WAL. + SavePoint wal_term_point_; + + // For HasXYZ. Mutable to allow lazy computation of results + mutable std::atomic<uint32_t> content_flags_; + + // Performs deferred computation of content_flags if necessary + uint32_t ComputeContentFlags() const; + + // Maximum size of rep_. + size_t max_bytes_; + + // Is the content of the batch the application's latest state that meant only + // to be used for recovery? Refer to + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for + // more details. + bool is_latest_persistent_state_ = false; + + protected: + std::string rep_; // See comment in write_batch.cc for the format of rep_ + const size_t timestamp_size_; + + // Intentionally copyable +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h new file mode 100644 index 000000000..19ff877e7 --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_batch_base.h @@ -0,0 +1,127 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cstddef> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Status; +class ColumnFamilyHandle; +class WriteBatch; +struct SliceParts; + +// Abstract base class that defines the basic interface for a write batch. +// See WriteBatch for a basic implementation and WrithBatchWithIndex for an +// indexed implementation. +class WriteBatchBase { + public: + virtual ~WriteBatchBase() {} + + // Store the mapping "key->value" in the database. + virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Put(const Slice& key, const Slice& value) = 0; + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatenations of arrays of + // slices. + virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Put(const SliceParts& key, const SliceParts& value); + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) = 0; + virtual Status Merge(const Slice& key, const Slice& value) = 0; + + // variant that takes SliceParts + virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + virtual Status Merge(const SliceParts& key, const SliceParts& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + virtual Status Delete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status Delete(const Slice& key) = 0; + + // variant that takes SliceParts + virtual Status Delete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status Delete(const SliceParts& key); + + // If the database contains a mapping for "key", erase it. Expects that the + // key was not overwritten. Else do nothing. + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual Status SingleDelete(const Slice& key) = 0; + + // variant that takes SliceParts + virtual Status SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key); + virtual Status SingleDelete(const SliceParts& key); + + // If the database contains mappings in the range ["begin_key", "end_key"), + // erase them. Else do nothing. + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) = 0; + virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0; + + // variant that takes SliceParts + virtual Status DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key); + virtual Status DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in which they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. + virtual Status PutLogData(const Slice& blob) = 0; + + // Clear all updates buffered in this batch. + virtual void Clear() = 0; + + // Covert this batch into a WriteBatch. This is an abstracted way of + // converting any WriteBatchBase(eg WriteBatchWithIndex) into a basic + // WriteBatch. + virtual WriteBatch* GetWriteBatch() = 0; + + // Records the state of the batch for future calls to RollbackToSavePoint(). + // May be called multiple times to set multiple save points. + virtual void SetSavePoint() = 0; + + // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the + // most recent call to SetSavePoint() and removes the most recent save point. + // If there is no previous call to SetSavePoint(), behaves the same as + // Clear(). + virtual Status RollbackToSavePoint() = 0; + + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + + // Sets the maximum size of the write batch in bytes. 0 means no limit. + virtual void SetMaxBytes(size_t max_bytes) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h new file mode 100644 index 000000000..ae1c98caf --- /dev/null +++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBufferManager is for managing memory allocation for one or more +// MemTables. + +#pragma once + +#include <atomic> +#include <cstddef> +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteBufferManager { + public: + // _buffer_size = 0 indicates no limit. Memory won't be capped. + // memory_usage() won't be valid and ShouldFlush() will always return true. + // if `cache` is provided, we'll put dummy entries in the cache and cost + // the memory allocated to the cache. It can be used even if _buffer_size = 0. + explicit WriteBufferManager(size_t _buffer_size, + std::shared_ptr<Cache> cache = {}); + // No copying allowed + WriteBufferManager(const WriteBufferManager&) = delete; + WriteBufferManager& operator=(const WriteBufferManager&) = delete; + + ~WriteBufferManager(); + + bool enabled() const { return buffer_size_ != 0; } + + bool cost_to_cache() const { return cache_rep_ != nullptr; } + + // Only valid if enabled() + size_t memory_usage() const { + return memory_used_.load(std::memory_order_relaxed); + } + size_t mutable_memtable_memory_usage() const { + return memory_active_.load(std::memory_order_relaxed); + } + size_t buffer_size() const { return buffer_size_; } + + // Should only be called from write thread + bool ShouldFlush() const { + if (enabled()) { + if (mutable_memtable_memory_usage() > mutable_limit_) { + return true; + } + if (memory_usage() >= buffer_size_ && + mutable_memtable_memory_usage() >= buffer_size_ / 2) { + // If the memory exceeds the buffer size, we trigger more aggressive + // flush. But if already more than half memory is being flushed, + // triggering more flush may not help. We will hold it instead. + return true; + } + } + return false; + } + + void ReserveMem(size_t mem) { + if (cache_rep_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); + } + } + // We are in the process of freeing `mem` bytes, so it is not considered + // when checking the soft limit. + void ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } + } + void FreeMem(size_t mem) { + if (cache_rep_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + } + + private: + const size_t buffer_size_; + const size_t mutable_limit_; + std::atomic<size_t> memory_used_; + // Memory that hasn't been scheduled to free. + std::atomic<size_t> memory_active_; + struct CacheRep; + std::unique_ptr<CacheRep> cache_rep_; + + void ReserveMemWithCache(size_t mem); + void FreeMemWithCache(size_t mem); +}; +} // namespace ROCKSDB_NAMESPACE |