summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/tools
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/tools')
-rw-r--r--src/rocksdb/tools/CMakeLists.txt30
-rw-r--r--src/rocksdb/tools/Dockerfile5
-rw-r--r--src/rocksdb/tools/advisor/README.md96
-rw-r--r--src/rocksdb/tools/advisor/advisor/__init__.py0
-rw-r--r--src/rocksdb/tools/advisor/advisor/bench_runner.py39
-rw-r--r--src/rocksdb/tools/advisor/advisor/config_optimizer_example.py134
-rw-r--r--src/rocksdb/tools/advisor/advisor/db_bench_runner.py245
-rw-r--r--src/rocksdb/tools/advisor/advisor/db_config_optimizer.py282
-rw-r--r--src/rocksdb/tools/advisor/advisor/db_log_parser.py131
-rw-r--r--src/rocksdb/tools/advisor/advisor/db_options_parser.py358
-rwxr-xr-xsrc/rocksdb/tools/advisor/advisor/db_stats_fetcher.py338
-rw-r--r--src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py208
-rw-r--r--src/rocksdb/tools/advisor/advisor/ini_parser.py76
-rw-r--r--src/rocksdb/tools/advisor/advisor/rule_parser.py528
-rw-r--r--src/rocksdb/tools/advisor/advisor/rule_parser_example.py89
-rw-r--r--src/rocksdb/tools/advisor/advisor/rules.ini214
-rw-r--r--src/rocksdb/tools/advisor/test/__init__.py0
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/LOG-030
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/LOG-125
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/OPTIONS-00000549
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts3
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/rules_err1.ini56
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/rules_err2.ini15
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/rules_err3.ini15
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/rules_err4.ini15
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/test_rules.ini47
-rw-r--r--src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini83
-rw-r--r--src/rocksdb/tools/advisor/test/test_db_bench_runner.py147
-rw-r--r--src/rocksdb/tools/advisor/test/test_db_log_parser.py103
-rw-r--r--src/rocksdb/tools/advisor/test/test_db_options_parser.py216
-rw-r--r--src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py126
-rw-r--r--src/rocksdb/tools/advisor/test/test_rule_parser.py234
-rwxr-xr-xsrc/rocksdb/tools/analyze_txn_stress_test.sh77
-rwxr-xr-xsrc/rocksdb/tools/auto_sanity_test.sh93
-rwxr-xr-xsrc/rocksdb/tools/benchmark.sh525
-rwxr-xr-xsrc/rocksdb/tools/benchmark_leveldb.sh187
-rw-r--r--src/rocksdb/tools/blob_dump.cc110
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/__init__.py2
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py2000
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh156
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py734
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc2308
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h393
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py721
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc717
-rw-r--r--src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc25
-rw-r--r--src/rocksdb/tools/check_all_python.py22
-rwxr-xr-xsrc/rocksdb/tools/check_format_compatible.sh191
-rw-r--r--src/rocksdb/tools/db_bench.cc21
-rw-r--r--src/rocksdb/tools/db_bench_tool.cc7048
-rw-r--r--src/rocksdb/tools/db_bench_tool_test.cc320
-rw-r--r--src/rocksdb/tools/db_crashtest.py499
-rw-r--r--src/rocksdb/tools/db_repl_stress.cc159
-rw-r--r--src/rocksdb/tools/db_sanity_test.cc297
-rwxr-xr-xsrc/rocksdb/tools/dbench_monitor102
-rw-r--r--src/rocksdb/tools/dump/db_dump_tool.cc259
-rw-r--r--src/rocksdb/tools/dump/rocksdb_dump.cc63
-rw-r--r--src/rocksdb/tools/dump/rocksdb_undump.cc62
-rwxr-xr-xsrc/rocksdb/tools/generate_random_db.sh31
-rwxr-xr-xsrc/rocksdb/tools/ingest_external_sst.sh18
-rw-r--r--src/rocksdb/tools/ldb.cc21
-rw-r--r--src/rocksdb/tools/ldb_cmd.cc3437
-rw-r--r--src/rocksdb/tools/ldb_cmd_impl.h628
-rw-r--r--src/rocksdb/tools/ldb_cmd_test.cc585
-rw-r--r--src/rocksdb/tools/ldb_test.py595
-rw-r--r--src/rocksdb/tools/ldb_tool.cc140
-rwxr-xr-xsrc/rocksdb/tools/pflag217
-rw-r--r--src/rocksdb/tools/rdb/.gitignore1
-rw-r--r--src/rocksdb/tools/rdb/API.md178
-rw-r--r--src/rocksdb/tools/rdb/README.md40
-rw-r--r--src/rocksdb/tools/rdb/binding.gyp25
-rw-r--r--src/rocksdb/tools/rdb/db_wrapper.cc526
-rw-r--r--src/rocksdb/tools/rdb/db_wrapper.h60
-rwxr-xr-xsrc/rocksdb/tools/rdb/rdb3
-rw-r--r--src/rocksdb/tools/rdb/rdb.cc16
-rw-r--r--src/rocksdb/tools/rdb/unit_test.js125
-rw-r--r--src/rocksdb/tools/reduce_levels_test.cc220
-rwxr-xr-xsrc/rocksdb/tools/regression_test.sh470
-rwxr-xr-xsrc/rocksdb/tools/report_lite_binary_size.sh42
-rwxr-xr-xsrc/rocksdb/tools/rocksdb_dump_test.sh9
-rwxr-xr-xsrc/rocksdb/tools/run_flash_bench.sh359
-rwxr-xr-xsrc/rocksdb/tools/run_leveldb.sh175
-rw-r--r--src/rocksdb/tools/sample-dump.dmpbin0 -> 100 bytes
-rw-r--r--src/rocksdb/tools/sst_dump.cc21
-rw-r--r--src/rocksdb/tools/sst_dump_test.cc282
-rw-r--r--src/rocksdb/tools/sst_dump_tool.cc778
-rw-r--r--src/rocksdb/tools/sst_dump_tool_imp.h87
-rw-r--r--src/rocksdb/tools/trace_analyzer.cc25
-rw-r--r--src/rocksdb/tools/trace_analyzer_test.cc727
-rw-r--r--src/rocksdb/tools/trace_analyzer_tool.cc2001
-rw-r--r--src/rocksdb/tools/trace_analyzer_tool.h292
-rwxr-xr-xsrc/rocksdb/tools/verify_random_db.sh39
-rwxr-xr-xsrc/rocksdb/tools/write_external_sst.sh25
-rw-r--r--src/rocksdb/tools/write_stress.cc305
-rw-r--r--src/rocksdb/tools/write_stress_runner.py74
95 files changed, 33605 insertions, 0 deletions
diff --git a/src/rocksdb/tools/CMakeLists.txt b/src/rocksdb/tools/CMakeLists.txt
new file mode 100644
index 000000000..4a4b0bcda
--- /dev/null
+++ b/src/rocksdb/tools/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Core command-line tools: always built.
+set(CORE_TOOLS
+  sst_dump.cc
+  ldb.cc)
+foreach(src ${CORE_TOOLS})
+  # The executable is named after its source file, minus the extension.
+  get_filename_component(exename ${src} NAME_WE)
+  add_executable(${exename}${ARTIFACT_SUFFIX}
+    ${src})
+  target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB})
+  list(APPEND core_tool_deps ${exename})
+endforeach()
+
+# Additional tools: built only when WITH_TOOLS is enabled.
+if(WITH_TOOLS)
+  set(TOOLS
+    db_sanity_test.cc
+    write_stress.cc
+    db_repl_stress.cc
+    dump/rocksdb_dump.cc
+    dump/rocksdb_undump.cc)
+  foreach(src ${TOOLS})
+    get_filename_component(exename ${src} NAME_WE)
+    add_executable(${exename}${ARTIFACT_SUFFIX}
+      ${src})
+    target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB})
+    list(APPEND tool_deps ${exename})
+  endforeach()
+
+  # Runs the ldb Python test-suite; the script in this directory is
+  # ldb_test.py (singular).
+  add_custom_target(ldb_tests
+    COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/ldb_test.py
+    DEPENDS ldb)
+endif()
diff --git a/src/rocksdb/tools/Dockerfile b/src/rocksdb/tools/Dockerfile
new file mode 100644
index 000000000..1d5ead7fd
--- /dev/null
+++ b/src/rocksdb/tools/Dockerfile
@@ -0,0 +1,5 @@
+FROM buildpack-deps:wheezy
+
+# COPY is the recommended instruction for plain local files; ADD's extra
+# behaviours (archive extraction, URL fetching) are not needed here.
+COPY ./ldb /rocksdb/tools/ldb
+
+# Exec form starts ldb directly instead of wrapping it in `/bin/sh -c`, so
+# the tool itself receives signals sent to the container.
+CMD ["/rocksdb/tools/ldb"]
diff --git a/src/rocksdb/tools/advisor/README.md b/src/rocksdb/tools/advisor/README.md
new file mode 100644
index 000000000..f1e7165e4
--- /dev/null
+++ b/src/rocksdb/tools/advisor/README.md
@@ -0,0 +1,96 @@
+# Rocksdb Tuning Advisor
+
+## Motivation
+
+The performance of Rocksdb is contingent on its tuning. However,
+because of the complexity of its underlying technology and a large number of
+configurable parameters, a good configuration is sometimes hard to obtain. The aim of
+the python command-line tool, Rocksdb Advisor, is to automate the process of
+suggesting improvements in the configuration based on advice from Rocksdb
+experts.
+
+## Overview
+
+Experts share their wisdom as rules comprising conditions and suggestions in the INI format (refer to
+[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)).
+Users provide the Rocksdb configuration that they want to improve upon (as the
+familiar Rocksdb OPTIONS file —
+[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini))
+and the path of the file which contains Rocksdb logs and statistics.
+The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py)
+creates appropriate DataSource objects (for Rocksdb
+[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py),
+[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py),
+[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.)
+and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py).
+The Rules Engine uses the rules from experts to parse the data-sources and trigger the appropriate rules.
+The Advisor's output gives information about which rules were triggered,
+why they were triggered and what each of them suggests. Each suggestion
+provided by a triggered rule advises some action on a Rocksdb
+configuration option, for example, increase CFOptions.write_buffer_size,
+set bloom_bits to 2 etc.
+
+## Usage
+
+### Prerequisites
+The tool needs the following to run:
+* python3
+
+### Running the tool
+An example command to run the tool:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20
+```
+
+### Command-line arguments
+
+The most important inputs that the Advisor needs are the rules
+spec and the starting Rocksdb configuration. The configuration is provided as the
+familiar Rocksdb Options file (refer to the [example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)).
+The Rules spec is written in the INI format (more details in
+[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)).
+
+In brief, a Rule is made of conditions and is triggered when all its
+constituent conditions are triggered. When triggered, a Rule suggests changes
+(increase/decrease/set to a suggested value) to certain Rocksdb options that
+aim to improve Rocksdb performance. Every Condition has a 'source' i.e.
+the data source that would be checked for triggering that condition.
+For example, a log Condition (with 'source=LOG') is triggered if a particular
+'regex' is found in the Rocksdb LOG files. As of now the Rules Engine
+supports 3 types of Conditions (and consequently data-sources):
+LOG, OPTIONS, TIME_SERIES. The TIME_SERIES data can be sourced from the
+Rocksdb [statistics](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/statistics.h)
+or [perf context](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/perf_context.h).
+
+For more information about the remaining command-line arguments, run:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m advisor.rule_parser_example --help
+```
+
+### Sample output
+
+Here, a Rocksdb log-based rule has been triggered:
+
+```shell
+Rule: stall-too-many-memtables
+LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2']
+Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase
+scope: col_fam:
+{'default'}
+```
+
+## Running the tests
+
+Tests for the code have been added to the
+[test/](https://github.com/facebook/rocksdb/tree/master/tools/advisor/test)
+directory. For example, to run the unit tests for db_log_parser.py:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m unittest -v test.test_db_log_parser
+```
diff --git a/src/rocksdb/tools/advisor/advisor/__init__.py b/src/rocksdb/tools/advisor/advisor/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/__init__.py
diff --git a/src/rocksdb/tools/advisor/advisor/bench_runner.py b/src/rocksdb/tools/advisor/advisor/bench_runner.py
new file mode 100644
index 000000000..7c7ee7882
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/bench_runner.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from abc import ABC, abstractmethod
+import re
+
+
+class BenchmarkRunner(ABC):
+    """Abstract interface for the benchmark runners used by the advisor."""
+
+    # Any character outside this allowed set is replaced by '_' when a
+    # database path is turned into an info-log file name. Raw string so the
+    # '\-' and '\.' escapes reach the regex engine untouched.
+    _INFO_LOG_UNSAFE_CHARS = re.compile(r'[^0-9a-zA-Z\-_\.]')
+
+    @staticmethod
+    @abstractmethod
+    def is_metric_better(new_metric, old_metric):
+        """Compare two metric values; implemented by concrete runners."""
+        pass
+
+    @abstractmethod
+    def run_experiment(self):
+        """Run the benchmark; implemented by concrete runners."""
+        # should return a list of DataSource objects
+        pass
+
+    @staticmethod
+    def get_info_log_file_name(log_dir, db_path):
+        """Return the name of the Rocksdb info-log file for a database.
+
+        Example: DB Path = /dev/shm and OPTIONS file has option
+        db_log_dir=/tmp/rocks/, then the name of the log file will be
+        'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is
+        not specified in the OPTIONS file, then the location of the log file
+        will be /dev/shm and the name of the file will be 'LOG'.
+
+        Args:
+            log_dir: value of the db_log_dir option; falsy when not set.
+            db_path: path of the database, e.g. /dev/shm/dbbench.
+
+        Returns:
+            The log file name (not its full path).
+        """
+        if not log_dir:
+            return 'LOG'
+        # refer GetInfoLogPrefix() in rocksdb/util/filename.cc
+        # db_path[1:] ignores the leading '/' character
+        file_name = BenchmarkRunner._INFO_LOG_UNSAFE_CHARS.sub(
+            '_', db_path[1:]
+        )
+        if not file_name.endswith('_'):
+            file_name += '_'
+        return file_name + 'LOG'
diff --git a/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
new file mode 100644
index 000000000..e3736387e
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+import argparse
+from advisor.db_config_optimizer import ConfigOptimizer
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import RulesSpec
+
+
+CONFIG_OPT_NUM_ITER = 10
+
+
+def main(args):
+    """Run the configuration optimizer as directed by the parsed arguments.
+
+    Builds the rules parser, the benchmark runner and the starting database
+    options from 'args', runs the optimizer and prints where the final
+    options file was written along with the final miscellaneous options.
+    """
+    # parser for the expert-specified rules
+    rules = RulesSpec(args.rules_spec)
+    # resolve the benchmark runner class from its module and class names
+    runner_module = __import__(
+        args.benchrunner_module, fromlist=[args.benchrunner_class]
+    )
+    runner_cls = getattr(runner_module, args.benchrunner_class)
+    # ODS settings are forwarded only when both the client and the entity
+    # were given; the key prefix is optional on top of those
+    ods_args = {}
+    if args.ods_client and args.ods_entity:
+        ods_args.update(
+            client_script=args.ods_client, entity=args.ods_entity
+        )
+        if args.ods_key_prefix:
+            ods_args['key_prefix'] = args.ods_key_prefix
+    runner = runner_cls(args.benchrunner_pos_args, ods_args)
+    # starting database configuration, with the frequency at which stats
+    # are dumped in the LOG file taken from the command line
+    db_options = DatabaseOptions(args.rocksdb_options, args.misc_options)
+    db_options.update_options({
+        "DBOptions.stats_dump_period_sec": {
+            NO_COL_FAMILY: args.stats_dump_period_sec
+        }
+    })
+    # improve the configuration for the given benchmark with the help of
+    # the expert-specified rules
+    optimizer = ConfigOptimizer(
+        runner, db_options, rules, args.base_db_path
+    )
+    final_db_options = optimizer.run()
+    # write out the final rocksdb options file and report the results
+    final_config_path = final_db_options.generate_options_config('final')
+    print('Final configuration in: ' + final_config_path)
+    final_misc_options = final_db_options.get_misc_options()
+    print('Final miscellaneous options: ' + repr(final_misc_options))
+
+
+if __name__ == '__main__':
+    # An example run of this tool from the command-line would look like:
+    # python3 -m advisor.config_optimizer_example
+    # --base_db_path=/tmp/rocksdbtest-155919/dbbench
+    # --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2
+    # --rules_spec=advisor/rules.ini --stats_dump_period_sec=20
+    # --benchrunner_module=advisor.db_bench_runner
+    # --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench
+    # readwhilewriting use_existing_db=true duration=90
+    arg_parser = argparse.ArgumentParser(
+        description=(
+            'This script is used for'
+            ' searching for a better database configuration'
+        )
+    )
+    arg_parser.add_argument(
+        '--rocksdb_options',
+        required=True,
+        type=str,
+        help='path of the starting Rocksdb OPTIONS file',
+    )
+    # these are options that are column-family agnostic and are not yet
+    # supported by the Rocksdb Options file: eg. bloom_bits=2
+    arg_parser.add_argument(
+        '--misc_options',
+        nargs='*',
+        help=(
+            'whitespace-separated list of options that are not supported '
+            'by the Rocksdb OPTIONS file, given in the '
+            '<option_name>=<option_value> format eg. "bloom_bits=2 '
+            'rate_limiter_bytes_per_sec=128000000"'
+        ),
+    )
+    arg_parser.add_argument(
+        '--base_db_path',
+        required=True,
+        type=str,
+        help='path for the Rocksdb database',
+    )
+    arg_parser.add_argument(
+        '--rules_spec',
+        required=True,
+        type=str,
+        help='path of the file containing the expert-specified Rules',
+    )
+    arg_parser.add_argument(
+        '--stats_dump_period_sec',
+        required=True,
+        type=int,
+        help=(
+            'the frequency (in seconds) at which STATISTICS are printed to '
+            'the Rocksdb LOG file'
+        ),
+    )
+    # ODS arguments
+    arg_parser.add_argument(
+        '--ods_client',
+        type=str,
+        help='the ODS client binary',
+    )
+    arg_parser.add_argument(
+        '--ods_entity',
+        type=str,
+        help='the servers for which the ODS stats need to be fetched',
+    )
+    arg_parser.add_argument(
+        '--ods_key_prefix',
+        type=str,
+        help=(
+            'the prefix that needs to be attached to the keys of time '
+            'series to be fetched from ODS'
+        ),
+    )
+    # benchrunner_module example: advisor.db_bench_runner
+    arg_parser.add_argument(
+        '--benchrunner_module',
+        required=True,
+        type=str,
+        help=(
+            'the module containing the BenchmarkRunner class to be used by '
+            'the Optimizer, example: advisor.db_bench_runner'
+        ),
+    )
+    # benchrunner_class example: DBBenchRunner
+    arg_parser.add_argument(
+        '--benchrunner_class',
+        required=True,
+        type=str,
+        help=(
+            'the name of the BenchmarkRunner class to be used by the '
+            'Optimizer, should be present in the module provided in the '
+            'benchrunner_module argument, example: DBBenchRunner'
+        ),
+    )
+    arg_parser.add_argument(
+        '--benchrunner_pos_args',
+        nargs='*',
+        help=(
+            'whitespace-separated positional arguments that are passed on '
+            'to the constructor of the BenchmarkRunner class provided in the '
+            'benchrunner_class argument, example: "use_existing_db=true '
+            'duration=900"'
+        ),
+    )
+    main(arg_parser.parse_args())
diff --git a/src/rocksdb/tools/advisor/advisor/db_bench_runner.py b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py
new file mode 100644
index 000000000..54424440b
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.bench_runner import BenchmarkRunner
+from advisor.db_log_parser import DataSource, DatabaseLogs, NO_COL_FAMILY
+from advisor.db_stats_fetcher import (
+ LogStatsParser, OdsStatsFetcher, DatabasePerfContext
+)
+import shutil
+import subprocess
+import time
+
+
+'''
+NOTE: This is not thread-safe, because the output file is simply overwritten.
+'''
+
+
+class DBBenchRunner(BenchmarkRunner):
+    """BenchmarkRunner that runs experiments with the db_bench binary.
+
+    The output of every db_bench invocation is redirected to OUTPUT_FILE and
+    ERROR_FILE, which are overwritten on each run.
+    """
+
+    OUTPUT_FILE = "temp/dbbench_out.tmp"
+    ERROR_FILE = "temp/dbbench_err.tmp"
+    # prefixes/tokens searched for in the db_bench output
+    DB_PATH = "DB path"
+    THROUGHPUT = "ops/sec"
+    PERF_CON = " PERF_CONTEXT:"
+
+    @staticmethod
+    def is_metric_better(new_metric, old_metric):
+        """Return True when new_metric is at least as high as old_metric."""
+        # for db_bench 'throughput' is the metric returned by run_experiment
+        return new_metric >= old_metric
+
+    @staticmethod
+    def get_opt_args_str(misc_options_dict):
+        """Format a dictionary of options as command-line arguments.
+
+        Every option with a truthy value contributes
+        ' --<option_name>=<option_value>' to the returned string; options
+        with falsy values are left out.
+        """
+        return "".join(
+            " --" + option_name + "=" + str(option_value)
+            for option_name, option_value in misc_options_dict.items()
+            if option_value
+        )
+
+    def __init__(self, positional_args, ods_args=None):
+        """Store the db_bench binary, the benchmark and any extra options.
+
+        Args:
+            positional_args: list whose first element is the db_bench binary
+                and second element is the benchmark name; any further
+                elements are db_bench options given as "<option>=<value>".
+            ods_args: optional dictionary of ODS settings, read by
+                run_experiment.
+        """
+        # parse positional_args list appropriately
+        self.db_bench_binary = positional_args[0]
+        self.benchmark = positional_args[1]
+        self.db_bench_args = None
+        if len(positional_args) > 2:
+            # options list with each option given as "<option>=<value>"
+            self.db_bench_args = positional_args[2:]
+        # save ods_args, if provided
+        self.ods_args = ods_args
+
+    def _parse_output(self, get_perf_context=False):
+        """Extract throughput, DB path and perf context from OUTPUT_FILE.
+
+        Sample db_bench output after running 'readwhilewriting' benchmark:
+            DB path: [/tmp/rocksdbtest-155919/dbbench]
+            readwhilewriting : 16.582 micros/op 60305 ops/sec; 4.2 MB/s
+            (3433828 of 5427999 found)
+            PERF_CONTEXT:
+            user_key_comparison_count = 500466712, block_cache_hit_count = ...
+
+        Returns:
+            A dictionary keyed by THROUGHPUT, DB_PATH and PERF_CON; an entry
+            stays None when the corresponding line was not found (or, for
+            PERF_CON, when get_perf_context is False).
+        """
+        output = {
+            self.THROUGHPUT: None, self.DB_PATH: None, self.PERF_CON: None
+        }
+        perf_context_begins = False
+        with open(self.OUTPUT_FILE, 'r') as fp:
+            for line in fp:
+                if line.startswith(self.benchmark):
+                    # line from sample output:
+                    # readwhilewriting : 16.582 micros/op 60305 ops/sec;
+                    # 4.2 MB/s (3433828 of 5427999 found)
+                    print(line)  # print output of the benchmark run
+                    token_list = line.strip().split()
+                    for ix, token in enumerate(token_list):
+                        if token.startswith(self.THROUGHPUT):
+                            # in above example, throughput = 60305 ops/sec
+                            output[self.THROUGHPUT] = (
+                                float(token_list[ix - 1])
+                            )
+                            break
+                elif get_perf_context and line.startswith(self.PERF_CON):
+                    # the following lines in the output contain perf context
+                    # statistics (refer example above)
+                    perf_context_begins = True
+                elif get_perf_context and perf_context_begins:
+                    # Sample perf_context output:
+                    # user_key_comparison_count = 500, block_cache_hit_count
+                    # = 468, block_read_count = 580, block_read_byte = 445...
+                    # TODO(poojam23): this is a hack and should be replaced
+                    # with the timestamp that db_bench will provide per
+                    # printed perf_context
+                    timestamp = int(time.time())
+                    perf_context_ts = {}
+                    for token in line.strip().split(','):
+                        parts = token.split('=')
+                        if len(parts) < 2:
+                            # empty or malformed token (no '='): skip it
+                            # instead of failing with an IndexError
+                            continue
+                        perf_context_ts[parts[0].strip()] = {
+                            timestamp: int(parts[1].strip())
+                        }
+                    output[self.PERF_CON] = perf_context_ts
+                    perf_context_begins = False
+                elif line.startswith(self.DB_PATH):
+                    # line from sample output:
+                    # DB path: [/tmp/rocksdbtest-155919/dbbench]
+                    output[self.DB_PATH] = (
+                        line.split('[')[1].split(']')[0]
+                    )
+        return output
+
+    def get_log_options(self, db_options, db_path):
+        """Return (logs_file_prefix, stats_freq_sec) for the given options.
+
+        logs_file_prefix is the location of the LOG file and stats_freq_sec
+        is the frequency at which stats are dumped in the LOG file (None
+        when the option is not present in db_options).
+        """
+        log_dir_path = None
+        stats_freq_sec = None
+
+        # fetch frequency at which the stats are dumped in the Rocksdb logs
+        dump_period = 'DBOptions.stats_dump_period_sec'
+        # fetch the directory, if specified, in which the Rocksdb logs are
+        # dumped, by default logs are dumped in same location as database
+        log_dir = 'DBOptions.db_log_dir'
+        log_options = db_options.get_options([dump_period, log_dir])
+        if dump_period in log_options:
+            stats_freq_sec = int(log_options[dump_period][NO_COL_FAMILY])
+        if log_dir in log_options:
+            log_dir_path = log_options[log_dir][NO_COL_FAMILY]
+
+        log_file_name = DBBenchRunner.get_info_log_file_name(
+            log_dir_path, db_path
+        )
+
+        if not log_dir_path:
+            log_dir_path = db_path
+        if not log_dir_path.endswith('/'):
+            log_dir_path += '/'
+
+        return (log_dir_path + log_file_name, stats_freq_sec)
+
+    def _get_options_command_line_args_str(self, curr_options):
+        """Create the db_bench command-line arguments for curr_options.
+
+        The --options_file argument is always given and the options that are
+        not supported by the OPTIONS file are given as separate arguments.
+        """
+        optional_args_str = DBBenchRunner.get_opt_args_str(
+            curr_options.get_misc_options()
+        )
+        # generate an options configuration file
+        options_file = curr_options.generate_options_config(nonce='12345')
+        optional_args_str += " --options_file=" + options_file
+        return optional_args_str
+
+    def _setup_db_before_experiment(self, curr_options, db_path):
+        """Recreate the database at db_path using the fillrandom benchmark."""
+        # remove destination directory if it already exists; with
+        # ignore_errors=True rmtree suppresses failures (a missing directory
+        # included), so there is no exception to handle here
+        shutil.rmtree(db_path, ignore_errors=True)
+        # setup database with a million keys using the fillrandom benchmark
+        command = "%s --benchmarks=fillrandom --db=%s --num=1000000" % (
+            self.db_bench_binary, db_path
+        )
+        command += self._get_options_command_line_args_str(curr_options)
+        self._run_command(command)
+
+    def _build_experiment_command(self, curr_options, db_path):
+        """Return the db_bench command line for the configured benchmark."""
+        command = "%s --benchmarks=%s --statistics --perf_level=3 --db=%s" % (
+            self.db_bench_binary, self.benchmark, db_path
+        )
+        # fetch the command-line arguments string for providing Rocksdb options
+        args_str = self._get_options_command_line_args_str(curr_options)
+        # handle the command-line args passed in the constructor, these
+        # arguments are specific to db_bench; db_bench_args is None when the
+        # constructor received no extra options
+        for cmd_line_arg in (self.db_bench_args or []):
+            args_str += (" --" + cmd_line_arg)
+        return command + args_str
+
+    def _run_command(self, command):
+        """Run command in a shell, writing its stdout/stderr to the temp files.
+
+        NOTE: the command string is handed to the shell as-is (shell=True).
+        """
+        # the context manager closes both files even if the call raises
+        with open(self.OUTPUT_FILE, "w+") as out_file, \
+                open(self.ERROR_FILE, "w+") as err_file:
+            print('executing... - ' + command)
+            subprocess.call(
+                command, shell=True, stdout=out_file, stderr=err_file
+            )
+
+    def run_experiment(self, db_options, db_path):
+        """Run the benchmark and return (data_sources, throughput).
+
+        data_sources is a dictionary keyed by DataSource.Type and throughput
+        is the value parsed from the db_bench output.
+        """
+        # setup the Rocksdb database before running experiment
+        self._setup_db_before_experiment(db_options, db_path)
+        # get the command to run the experiment
+        command = self._build_experiment_command(db_options, db_path)
+        experiment_start_time = int(time.time())
+        # run experiment
+        self._run_command(command)
+        experiment_end_time = int(time.time())
+        # parse the db_bench experiment output
+        parsed_output = self._parse_output(get_perf_context=True)
+
+        # get the log files path prefix and frequency at which Rocksdb stats
+        # are dumped in the logs
+        logs_file_prefix, stats_freq_sec = self.get_log_options(
+            db_options, parsed_output[self.DB_PATH]
+        )
+        # create the Rocksdb LOGS object
+        db_logs = DatabaseLogs(
+            logs_file_prefix, db_options.get_column_families()
+        )
+        # Create the Log STATS object
+        db_log_stats = LogStatsParser(logs_file_prefix, stats_freq_sec)
+        # Create the PerfContext STATS object
+        db_perf_context = DatabasePerfContext(
+            parsed_output[self.PERF_CON], 0, False
+        )
+        # create the data-sources dictionary
+        data_sources = {
+            DataSource.Type.DB_OPTIONS: [db_options],
+            DataSource.Type.LOG: [db_logs],
+            DataSource.Type.TIME_SERIES: [db_log_stats, db_perf_context]
+        }
+        # Create the ODS STATS object
+        if self.ods_args:
+            key_prefix = ''
+            if 'key_prefix' in self.ods_args:
+                key_prefix = self.ods_args['key_prefix']
+            data_sources[DataSource.Type.TIME_SERIES].append(OdsStatsFetcher(
+                self.ods_args['client_script'],
+                self.ods_args['entity'],
+                experiment_start_time,
+                experiment_end_time,
+                key_prefix
+            ))
+        # return the experiment's data-sources and throughput
+        return data_sources, parsed_output[self.THROUGHPUT]
diff --git a/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py
new file mode 100644
index 000000000..508c0f8fe
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import Suggestion
+import copy
+import random
+
+
class ConfigOptimizer:
    """Iteratively tunes a Rocksdb configuration using triggered rules.

    In every iteration ONE triggered rule is picked, all of its suggestions
    are applied to the options and an experiment is run. The change is kept
    if the benchmark metric improved and reverted (backtracked) otherwise.
    """
    SCOPE = 'scope'
    SUGG_VAL = 'suggested values'

    @staticmethod
    def apply_action_on_value(old_value, action, suggested_values):
        """Return the value an option should take after applying 'action'.

        Args:
            old_value: current value of the option (a number or a numeric
                string), or None/empty when the option has no value yet.
            action: a Suggestion.Action member (set, increase or decrease).
            suggested_values: optional collection of candidate values; one
                of them is chosen at random when a value must be picked.

        Returns:
            The chosen suggested value when the action is 'set' or there is
            no old value; otherwise the adjusted value as an int.

        Raises:
            AssertionError: a suggested value is needed but none is
                available. Callers catch this to print a warning.
        """
        chosen_sugg_val = None
        if suggested_values:
            chosen_sugg_val = random.choice(list(suggested_values))
        # A numeric 0 is a legitimate current value that can be increased
        # or decreased; only None/empty values count as "no old value".
        is_numeric_zero = (
            isinstance(old_value, (int, float)) and
            not isinstance(old_value, bool) and
            old_value == 0
        )
        value_missing = not old_value and not is_numeric_zero
        if action is Suggestion.Action.set or value_missing:
            # raised explicitly (not via 'assert') so that the check also
            # holds when Python runs with assertions disabled (-O)
            if not chosen_sugg_val:
                raise AssertionError('no suggested value to choose from')
            return chosen_sugg_val
        # For increase/decrease actions, currently the code tries to make
        # a 30% change in the option's value per iteration. An addend is
        # also present (+2 or -2) to handle the cases when the option's
        # old value was 0 or the final int() conversion suppressed the 30%
        # change made to the option
        old_value = float(old_value)
        mul = 0
        add = 0
        if action is Suggestion.Action.increase:
            # for negative values, shrinking the magnitude is an increase
            mul = 0.7 if old_value < 0 else 1.3
            add = 2
        elif action is Suggestion.Action.decrease:
            mul = 1.3 if old_value < 0 else 0.7
            add = -2
        return int(old_value * mul + add)

    @staticmethod
    def _new_value_or_warn(old_value, sugg):
        # Returns (True, new_value) for the suggestion, or (False, None)
        # after printing a warning when no suggested value was available.
        try:
            new_value = ConfigOptimizer.apply_action_on_value(
                old_value, sugg.action, sugg.suggested_values
            )
        except AssertionError:
            print(
                'WARNING(ConfigOptimizer): provide suggested_values ' +
                'for ' + sugg.option
            )
            return False, None
        return True, new_value

    @staticmethod
    def improve_db_config(options, rule, suggestions_dict):
        """Apply all the suggestions of ONE rule on the appropriate options.

        Args:
            options: options object providing get_options().
            rule: the rule whose suggestions are applied; it provides
                get_suggestions() and get_trigger_column_families().
            suggestions_dict: maps a suggestion name to its suggestion
                object (with option, action and suggested_values).

        Returns:
            A tuple (current_config, updated_config), both of the form
            Dict[option, Dict[col_fam, value]]. updated_config holds only
            the options for which a new value could be computed.
        """
        rule_suggestions = []
        for sugg_name in rule.get_suggestions():
            sugg = suggestions_dict[sugg_name]
            # A Suggestion in the rules spec must have the 'option' and
            # 'action' fields defined, always call perform_checks() method
            # after parsing the rules file using RulesSpec
            assert(sugg.option)
            assert(sugg.action)
            rule_suggestions.append(sugg)
        current_config = options.get_options(
            [sugg.option for sugg in rule_suggestions]
        )
        # Create the updated configuration from the rule's suggestions
        updated_config = {}
        for sugg in rule_suggestions:
            if sugg.option not in current_config:
                # case: the option is absent from the current configuration
                found, new_value = ConfigOptimizer._new_value_or_warn(
                    None, sugg
                )
                if not found:
                    continue
                option_update = updated_config.setdefault(sugg.option, {})
                if DatabaseOptions.is_misc_option(sugg.option):
                    # this suggestion is on an option that is not yet
                    # supported by the Rocksdb OPTIONS file and so it is
                    # not prefixed by a section type.
                    option_update[NO_COL_FAMILY] = new_value
                else:
                    for col_fam in rule.get_trigger_column_families():
                        option_update[col_fam] = new_value
            elif NO_COL_FAMILY in current_config[sugg.option]:
                # case: the option is present and is database-wide
                found, new_value = ConfigOptimizer._new_value_or_warn(
                    current_config[sugg.option][NO_COL_FAMILY], sugg
                )
                if found:
                    option_update = updated_config.setdefault(
                        sugg.option, {}
                    )
                    option_update[NO_COL_FAMILY] = new_value
            else:
                # case: the option is present per column family; a new
                # value is computed for each column family that triggered
                # the rule
                for col_fam in rule.get_trigger_column_families():
                    found, new_value = ConfigOptimizer._new_value_or_warn(
                        current_config[sugg.option].get(col_fam), sugg
                    )
                    if found:
                        option_update = updated_config.setdefault(
                            sugg.option, {}
                        )
                        option_update[col_fam] = new_value
        return current_config, updated_config

    @staticmethod
    def pick_rule_to_apply(rules, last_rule_name, rules_tried, backtrack):
        """Choose the rule to apply in the next iteration.

        Args:
            rules: the currently triggered rules (objects with a 'name').
            last_rule_name: name of the rule applied last, or None.
            rules_tried: names of the rules that have been tried already.
            backtrack: True if the last rule did not improve performance.

        Returns:
            The chosen rule, or None when there is nothing left to try.
        """
        if not rules:
            print('\nNo more rules triggered!')
            return None
        # if the last rule provided an improvement in the database
        # performance, and it was triggered again (i.e. it is present in
        # 'rules'), then pick the same rule for this iteration too.
        if last_rule_name and not backtrack:
            for rule in rules:
                if rule.name == last_rule_name:
                    return rule
        # there was no previous rule OR the previous rule did not improve db
        # performance OR it was not triggered for this iteration,
        # then pick another rule that has not been tried yet
        untried = next(
            (rule for rule in rules if rule.name not in rules_tried), None
        )
        if untried is None:
            print('\nAll rules have been exhausted')
        return untried

    @staticmethod
    def apply_suggestions(
        triggered_rules,
        current_rule_name,
        rules_tried,
        backtrack,
        curr_options,
        suggestions_dict
    ):
        """Pick a rule whose suggestions actually change the configuration.

        Rules are picked one after the other (each picked rule is added to
        'rules_tried') until one produces an updated configuration that
        differs from the current one.

        Returns:
            A tuple (rule, rules_tried, current_config, updated_config), or
            a tuple of four None values when no rule is left.
        """
        rule_name = current_rule_name
        while True:
            curr_rule = ConfigOptimizer.pick_rule_to_apply(
                triggered_rules, rule_name, rules_tried, backtrack
            )
            if not curr_rule:
                return (None, None, None, None)
            # a rule has been picked for improving db_config, so update
            # rules_tried
            rules_tried.add(curr_rule.name)
            # get updated config based on the picked rule
            curr_conf, updated_conf = ConfigOptimizer.improve_db_config(
                curr_options, curr_rule, suggestions_dict
            )
            if DatabaseOptions.get_options_diff(curr_conf, updated_conf):
                return (curr_rule, rules_tried, curr_conf, updated_conf)
            # the current and updated configs are the same: try another
            # rule, this time without a preferred rule name
            rule_name = None

    # TODO(poojam23): check if this method is required or can we directly set
    # the config equal to the curr_config
    @staticmethod
    def get_backtrack_config(curr_config, updated_config):
        """Return the old values of every option changed by the update.

        The result has the form Dict[option, Dict[col_fam, old_value]] and
        is also printed.
        """
        diff = DatabaseOptions.get_options_diff(curr_config, updated_config)
        # each diff entry is a (old_value, new_value) tuple
        bt_config = {
            option: {
                col_fam: values[0]
                for col_fam, values in diff[option].items()
            }
            for option in diff
        }
        print(bt_config)
        return bt_config

    def __init__(self, bench_runner, db_options, rule_parser, base_db):
        """Store the collaborators used by run().

        Args:
            bench_runner: provides run_experiment() and is_metric_better().
            db_options: initial options; run() works on a deep copy.
            rule_parser: the advisor's rules parser.
            base_db: database path passed to every experiment.
        """
        self.bench_runner = bench_runner
        self.db_options = db_options
        self.rule_parser = rule_parser
        self.base_db_path = base_db

    def _get_triggered_rules(self, data_sources, options):
        # (Re)boot the advisor on the given data sources, print and return
        # the rules that were triggered.
        self.rule_parser.load_rules_from_spec()
        self.rule_parser.perform_section_checks()
        triggered_rules = self.rule_parser.get_triggered_rules(
            data_sources, options.get_column_families()
        )
        print('\nTriggered:')
        self.rule_parser.print_rules(triggered_rules)
        return triggered_rules

    def run(self):
        """Run the optimization loop and return the final options object."""
        # In every iteration of this method's optimization loop we pick ONE
        # RULE from all the triggered rules and apply all its suggestions to
        # the appropriate options.
        # bootstrapping the optimizer
        print('Bootstrapping optimizer:')
        options = copy.deepcopy(self.db_options)
        old_data_sources, old_metric = (
            self.bench_runner.run_experiment(options, self.base_db_path)
        )
        print('Initial metric: ' + str(old_metric))
        triggered_rules = self._get_triggered_rules(old_data_sources, options)
        backtrack = False
        rules_tried = set()
        curr_rule, rules_tried, curr_conf, updated_conf = (
            ConfigOptimizer.apply_suggestions(
                triggered_rules,
                None,
                rules_tried,
                backtrack,
                options,
                self.rule_parser.get_suggestions_dict()
            )
        )
        # the optimizer loop
        while curr_rule:
            print('\nRule picked for next iteration:')
            print(curr_rule.name)
            print('\ncurrent config:')
            print(curr_conf)
            print('updated config:')
            print(updated_conf)
            options.update_options(updated_conf)
            # run bench_runner with updated config
            new_data_sources, new_metric = (
                self.bench_runner.run_experiment(options, self.base_db_path)
            )
            print('\nnew metric: ' + str(new_metric))
            backtrack = not self.bench_runner.is_metric_better(
                new_metric, old_metric
            )
            if backtrack:
                # revert changes to options config
                print('\nBacktracking to previous configuration')
                backtrack_conf = ConfigOptimizer.get_backtrack_config(
                    curr_conf, updated_conf
                )
                options.update_options(backtrack_conf)
            else:
                # the change is kept: run advisor on new data sources and
                # start afresh with the set of tried rules
                triggered_rules = self._get_triggered_rules(
                    new_data_sources, options
                )
                old_metric = new_metric
                rules_tried = set()
            # pick rule to work on and set curr_rule to that
            curr_rule, rules_tried, curr_conf, updated_conf = (
                ConfigOptimizer.apply_suggestions(
                    triggered_rules,
                    curr_rule.name,
                    rules_tried,
                    backtrack,
                    options,
                    self.rule_parser.get_suggestions_dict()
                )
            )
        # return the final database options configuration
        return options
diff --git a/src/rocksdb/tools/advisor/advisor/db_log_parser.py b/src/rocksdb/tools/advisor/advisor/db_log_parser.py
new file mode 100644
index 000000000..efd41a81a
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_log_parser.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from abc import ABC, abstractmethod
+from calendar import timegm
+from enum import Enum
+import glob
+import re
+import time
+
+
# Name used in place of a column family for anything that is database-wide,
# e.g. a log line that mentions none of the known column families.
NO_COL_FAMILY = 'DB_WIDE'
+
+
class DataSource(ABC):
    """Abstract base of the sources that conditions are checked against."""

    class Type(Enum):
        # the kinds of data source
        LOG = 1
        DB_OPTIONS = 2
        TIME_SERIES = 3

    def __init__(self, type):
        # 'type' is expected to be a DataSource.Type member; the parameter
        # name shadows the builtin but is kept for existing callers
        self.type = type

    @abstractmethod
    def check_and_trigger_conditions(self, conditions):
        """Check the given conditions against this data source.

        Must be implemented by every concrete data source.
        """
+
+
class Log:
    """One entry of a Rocksdb LOG file.

    An entry consists of a timestamp, a context token, the message (which
    may span several lines, see append_message) and the column family the
    message refers to.
    """

    @staticmethod
    def is_new_log(log_line):
        """Return a match object if 'log_line' starts a new log entry.

        The assumption is that a new log will start with a date printed in
        the format matched below, e.g. '2018/07/25-17:29:05.176080'.
        Returns None for any other line.
        """
        # raw string: '\d' and '\.' are regex escapes, not string escapes
        date_regex = r'\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}'
        return re.match(date_regex, log_line)

    def __init__(self, log_line, column_families):
        """Parse the first line of a log entry.

        Args:
            log_line: a line for which is_new_log() holds; its first token
                is the time, the second the context, the rest the message.
            column_families: iterable of column family names to look for
                in the message.
        """
        token_list = log_line.strip().split()
        self.time = token_list[0]
        self.context = token_list[1]
        self.message = " ".join(token_list[2:])
        self.column_family = None
        # example log for 'default' column family:
        # "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
        # [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
        for col_fam in column_families:
            # literal substring test, so that column family names that
            # contain regex metacharacters are matched verbatim
            if '[' + col_fam + ']' in self.message:
                self.column_family = col_fam
                break
        if not self.column_family:
            self.column_family = NO_COL_FAMILY

    def get_human_readable_time(self):
        # example from a log line: '2018/07/25-11:25:45.782710'
        return self.time

    def get_column_family(self):
        return self.column_family

    def get_context(self):
        return self.context

    def get_message(self):
        return self.message

    def append_message(self, remaining_log):
        """Append a continuation line to the message of this entry."""
        self.message = self.message + '\n' + remaining_log.strip()

    def get_timestamp(self):
        """Return the time of this entry as a Unix timestamp in seconds.

        Example: '2018/07/25-11:25:45.782710' will be converted to the GMT
        Unix timestamp 1532517945 (note: this method assumes that self.time
        is in GMT; the fractional seconds are dropped).
        """
        parsed_time = time.strptime(self.time, "%Y/%m/%d-%H:%M:%S.%f")
        return timegm(parsed_time)

    def __repr__(self):
        return (
            'time: ' + self.time + '; context: ' + self.context +
            '; col_fam: ' + self.column_family +
            '; message: ' + self.message
        )
+
+
class DatabaseLogs(DataSource):
    """LOG data source: checks log conditions against Rocksdb LOG files."""

    def __init__(self, logs_path_prefix, column_families):
        """
        Args:
            logs_path_prefix: path prefix; every file matching the glob
                '<prefix>*' is treated as a log file.
            column_families: column family names used to attribute each
                log entry to a column family.
        """
        super().__init__(DataSource.Type.LOG)
        self.logs_path_prefix = logs_path_prefix
        self.column_families = column_families

    def trigger_conditions_for_log(self, conditions, log):
        """Record 'log' in the trigger of every condition it matches.

        For a LogCondition object, trigger is:
        Dict[column_family_name, List[Log]]. This explains why the condition
        was triggered and for which column families.
        """
        message = log.get_message()
        for cond in conditions:
            if not re.search(cond.regex, message, re.IGNORECASE):
                continue
            # start a fresh trigger if the condition has none yet
            trigger = cond.get_trigger() or {}
            trigger.setdefault(log.get_column_family(), []).append(log)
            cond.set_trigger(trigger)

    def check_and_trigger_conditions(self, conditions):
        """Scan all the log files and trigger the matching conditions."""
        for file_name in glob.glob(self.logs_path_prefix + '*'):
            # TODO(poojam23): find a way to distinguish between log files
            # - generated in the current experiment but are labeled 'old'
            # because they LOGs exceeded the file size limit AND
            # - generated in some previous experiment that are also labeled
            # 'old' and were not deleted for some reason
            if re.search('old', file_name, re.IGNORECASE):
                continue
            with open(file_name, 'r') as db_logs:
                new_log = None
                for line in db_logs:
                    if Log.is_new_log(line):
                        # the previous entry is complete now: check it
                        if new_log:
                            self.trigger_conditions_for_log(
                                conditions, new_log
                            )
                        new_log = Log(line, self.column_families)
                    elif new_log:
                        # To account for logs split into multiple lines
                        new_log.append_message(line)
                    # else: a continuation line before the first dated
                    # entry of the file has no entry to belong to; skip it
                # Check for the last log in the file.
                if new_log:
                    self.trigger_conditions_for_log(conditions, new_log)
diff --git a/src/rocksdb/tools/advisor/advisor/db_options_parser.py b/src/rocksdb/tools/advisor/advisor/db_options_parser.py
new file mode 100644
index 000000000..e689d892a
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_options_parser.py
@@ -0,0 +1,358 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+import copy
+from advisor.db_log_parser import DataSource, NO_COL_FAMILY
+from advisor.ini_parser import IniParser
+import os
+
+
class OptionsSpecParser(IniParser):
    """Helpers to parse and generate the lines of a Rocksdb OPTIONS file."""

    @staticmethod
    def is_new_option(line):
        # an option line has the form '<key>=<value>'
        return '=' in line

    @staticmethod
    def get_section_type(line):
        '''
        Example section header: [TableOptions/BlockBasedTable "default"]
        Here the section type returned would be
        'TableOptions.BlockBasedTable'
        '''
        # drop the enclosing brackets and keep the path before the name
        section_path = line.strip()[1:-1].split()[0]
        return section_path.replace('/', '.')

    @staticmethod
    def get_section_name(line):
        '''
        Example: get_section_name('[CFOptions "default"]') returns
        'default'. None is returned when the header has no quoted name.
        '''
        token_list = line.strip()[1:-1].split('"')
        # token_list = ['CFOptions ', 'default', '']
        if len(token_list) < 3:
            return None
        return token_list[1]  # return 'default'

    @staticmethod
    def get_section_str(section_type, section_name):
        '''
        Build a section header from a section type and name.
        Case 1: get_section_str('DBOptions', NO_COL_FAMILY) returns
        '[DBOptions]'
        Case 2: get_section_str('TableOptions.BlockBasedTable', 'default')
        returns '[TableOptions/BlockBasedTable "default"]'
        '''
        section_path = section_type.strip().replace('.', '/')
        if section_name == NO_COL_FAMILY:
            return '[' + section_path + ']'
        return '[' + section_path + ' "' + section_name + '"]'

    @staticmethod
    def get_option_str(key, values):
        '''
        Build an option line '<key>=<value>'.
        get_option_str('db_log_dir', None) returns 'db_log_dir='
        get_option_str('write_buffer_size', 1048576) returns
        'write_buffer_size=1048576'
        get_option_str('max_bytes_for_level_multiplier_additional',
        [1,1,1,1,1,1,1]) returns
        'max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1'
        '''
        option_str = key + '='
        # only None means "no value": a numeric 0 must still be written
        if values is None:
            return option_str
        if isinstance(values, list):
            return option_str + ':'.join(str(value) for value in values)
        return option_str + str(values)
+
+
class DatabaseOptions(DataSource):
    """DB_OPTIONS data source backed by a Rocksdb OPTIONS file.

    Options read from the file are stored as
    Dict[section_type, Dict[section_name, Dict[option_name, value]]] and
    are addressed as '<section_type>.<option_name>'. Miscellaneous options,
    which are not yet supported by the OPTIONS file, are stored separately
    and are addressed by their bare name.
    """

    @staticmethod
    def is_misc_option(option_name):
        # these are miscellaneous options that are not yet supported by the
        # Rocksdb options file, hence they are not prefixed with any section
        # name
        return '.' not in option_name

    @staticmethod
    def get_options_diff(opt_old, opt_new):
        """Return the differences between two configurations.

        type: Dict[option, Dict[col_fam, value]] X 2 ->
        Dict[option, Dict[col_fam, Tuple(old_value, new_value)]]
        A tuple is present only for values that differ; a value missing on
        one side is reported as None. Options without any difference are
        left out of the result.
        """
        diff = {}
        for opt in set(opt_old.keys()).union(set(opt_new.keys())):
            # an option in the union is present in at least one config
            old_values = opt_old.get(opt, {})
            new_values = opt_new.get(opt, {})
            opt_diff = {}
            for col_fam in old_values:
                if col_fam not in new_values:
                    opt_diff[col_fam] = (old_values[col_fam], None)
                elif old_values[col_fam] != new_values[col_fam]:
                    opt_diff[col_fam] = (
                        old_values[col_fam], new_values[col_fam]
                    )
            for col_fam in new_values:
                if col_fam not in old_values:
                    opt_diff[col_fam] = (None, new_values[col_fam])
            if opt_diff:
                diff[opt] = opt_diff
        return diff

    def __init__(self, rocksdb_options, misc_options=None):
        """
        Args:
            rocksdb_options: path of the Rocksdb OPTIONS file to load.
            misc_options: optional List[str] where each element has the
                format "<option_name>=<option_value>".
        """
        super().__init__(DataSource.Type.DB_OPTIONS)
        # The options are stored in the following data structure:
        # Dict[section_type, Dict[section_name, Dict[option_name, value]]]
        self.options_dict = None
        self.column_families = None
        # Load the options from the given file to a dictionary.
        self.load_from_source(rocksdb_options)
        # Setup the miscellaneous options expected to be List[str], where each
        # element in the List has the format "<option_name>=<option_value>"
        # These options are the ones that are not yet supported by the Rocksdb
        # OPTIONS file, so they are provided separately
        self.setup_misc_options(misc_options)

    def setup_misc_options(self, misc_options):
        """Store the "<option_name>=<option_value>" strings as a dict."""
        self.misc_options = {}
        if misc_options:
            for option_pair_str in misc_options:
                # split on the first '=' only, so that a value which itself
                # contains '=' is kept intact
                tokens = option_pair_str.split('=', 1)
                option_name = tokens[0].strip()
                option_value = tokens[1].strip()
                self.misc_options[option_name] = option_value

    def load_from_source(self, options_path):
        """Load the options of the OPTIONS file at 'options_path'."""
        self.options_dict = {}
        with open(options_path, 'r') as db_options:
            for line in db_options:
                line = OptionsSpecParser.remove_trailing_comment(line)
                if not line:
                    continue
                if OptionsSpecParser.is_section_header(line):
                    curr_sec_type = (
                        OptionsSpecParser.get_section_type(line)
                    )
                    curr_sec_name = OptionsSpecParser.get_section_name(line)
                    if curr_sec_type not in self.options_dict:
                        self.options_dict[curr_sec_type] = {}
                    # sections without a name hold database-wide options
                    if not curr_sec_name:
                        curr_sec_name = NO_COL_FAMILY
                    self.options_dict[curr_sec_type][curr_sec_name] = {}
                    # example: if the line read from the Rocksdb OPTIONS file
                    # is [CFOptions "default"], then the section type is
                    # CFOptions and 'default' is the name of a column family
                    # of this database, so it's added to the list of
                    # column families stored in this object
                    if curr_sec_type == 'CFOptions':
                        if not self.column_families:
                            self.column_families = []
                        self.column_families.append(curr_sec_name)
                elif OptionsSpecParser.is_new_option(line):
                    key, value = OptionsSpecParser.get_key_value_pair(line)
                    self.options_dict[curr_sec_type][curr_sec_name][key] = (
                        value
                    )
                else:
                    error = 'Not able to parse line in Options file.'
                    OptionsSpecParser.exit_with_parse_error(line, error)

    def get_misc_options(self):
        # these are options that are not yet supported by the Rocksdb OPTIONS
        # file, hence they are provided and stored separately
        return self.misc_options

    def get_column_families(self):
        return self.column_families

    def get_all_options(self):
        """Return all the options that are stored in this object.

        The result is a
        Dict[<sec_type>.<option_name>: Dict[col_fam, option_value]]
        (misc options are keyed by their bare name).
        """
        # Example: in the section header '[CFOptions "default"]' read from the
        # OPTIONS file, sec_type='CFOptions'
        all_options = [
            sec_type + '.' + opt_name
            for sec_type in self.options_dict
            for col_fam in self.options_dict[sec_type]
            for opt_name in self.options_dict[sec_type][col_fam]
        ]
        all_options.extend(list(self.misc_options.keys()))
        return self.get_options(all_options)

    def get_options(self, reqd_options):
        """Return the values of the required options that are available.

        type: List[str] -> Dict[str, Dict[str, Any]]
        List[option] -> Dict[option, Dict[col_fam, value]]
        Options that are not stored in this object are left out.
        """
        reqd_options_dict = {}
        for option in reqd_options:
            if DatabaseOptions.is_misc_option(option):
                # the option is not prefixed by '<section_type>.' because it is
                # not yet supported by the Rocksdb OPTIONS file; so it has to
                # be fetched from the misc_options dictionary
                if option not in self.misc_options:
                    continue
                if option not in reqd_options_dict:
                    reqd_options_dict[option] = {}
                reqd_options_dict[option][NO_COL_FAMILY] = (
                    self.misc_options[option]
                )
            else:
                # Example: option = 'TableOptions.BlockBasedTable.block_align'
                # then, sec_type = 'TableOptions.BlockBasedTable'
                sec_type = '.'.join(option.split('.')[:-1])
                # opt_name = 'block_align'
                opt_name = option.split('.')[-1]
                if sec_type not in self.options_dict:
                    continue
                for col_fam in self.options_dict[sec_type]:
                    if opt_name in self.options_dict[sec_type][col_fam]:
                        if option not in reqd_options_dict:
                            reqd_options_dict[option] = {}
                        reqd_options_dict[option][col_fam] = (
                            self.options_dict[sec_type][col_fam][opt_name]
                        )
        return reqd_options_dict

    def update_options(self, options):
        """Insert or overwrite the given option values in this object.

        An example 'options' object looks like:
        {'DBOptions.max_background_jobs': {NO_COL_FAMILY: 2},
        'CFOptions.write_buffer_size': {'default': 1048576, 'cf_A': 128000},
        'bloom_bits': {NO_COL_FAMILY: 4}}
        """
        for option in options:
            if DatabaseOptions.is_misc_option(option):
                # this is a misc_option i.e. an option that is not yet
                # supported by the Rocksdb OPTIONS file, so it is not prefixed
                # by '<section_type>.' and must be stored in the separate
                # misc_options dictionary
                if NO_COL_FAMILY not in options[option]:
                    print(
                        'WARNING(DatabaseOptions.update_options): not ' +
                        'updating option ' + option + ' because it is in ' +
                        'misc_option format but its scope is not ' +
                        NO_COL_FAMILY + '. Check format of option.'
                    )
                    continue
                self.misc_options[option] = options[option][NO_COL_FAMILY]
            else:
                sec_name = '.'.join(option.split('.')[:-1])
                opt_name = option.split('.')[-1]
                if sec_name not in self.options_dict:
                    self.options_dict[sec_name] = {}
                for col_fam in options[option]:
                    # if the option is not already present in the dictionary,
                    # it will be inserted, else it will be updated to the new
                    # value
                    if col_fam not in self.options_dict[sec_name]:
                        self.options_dict[sec_name][col_fam] = {}
                    self.options_dict[sec_name][col_fam][opt_name] = (
                        copy.deepcopy(options[option][col_fam])
                    )

    def generate_options_config(self, nonce):
        """Write the stored options to an OPTIONS file in the INI format.

        The file is '../temp/OPTIONS_<nonce>.tmp' relative to the directory
        of this module; its path is returned. Misc options are not written.
        """
        this_path = os.path.abspath(os.path.dirname(__file__))
        file_name = '../temp/OPTIONS_' + str(nonce) + '.tmp'
        file_path = os.path.join(this_path, file_name)
        # make sure the target directory exists before writing into it
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w') as fp:
            for section in self.options_dict:
                for col_fam in self.options_dict[section]:
                    fp.write(
                        OptionsSpecParser.get_section_str(section, col_fam) +
                        '\n'
                    )
                    for option in self.options_dict[section][col_fam]:
                        values = self.options_dict[section][col_fam][option]
                        fp.write(
                            OptionsSpecParser.get_option_str(option, values) +
                            '\n'
                        )
                fp.write('\n')
        return file_path

    def check_and_trigger_conditions(self, conditions):
        """Evaluate the option conditions and set their triggers.

        Each condition provides 'options' (the option names it needs),
        'eval_expr' (a Python expression) and 'name'. The expression is
        evaluated with the local list 'options' holding the values of the
        required options, in the order of cond.options.

        Trigger for an OptionCondition object is of the form:
        Dict[col_fam_name: List[option_value]]
        where col_fam_name is the name of a column family for which
        'eval_expr' evaluated to True and List[option_value] is the list
        of values of the options specified in the condition's 'options'
        field.
        """
        # NOTE(review): eval() runs the expressions of the rules spec as
        # Python code, so the rules file must come from a trusted source.
        for cond in conditions:
            reqd_options_dict = self.get_options(cond.options)
            # This contains the indices of options that are specific to some
            # column family and are not database-wide options.
            incomplete_option_ix = []
            # 'options' must keep this name: cond.eval_expr is evaluated in
            # this scope and refers to the values through it
            options = []
            missing_reqd_option = False
            for ix, option in enumerate(cond.options):
                if option not in reqd_options_dict:
                    print(
                        'WARNING(DatabaseOptions.check_and_trigger): ' +
                        'skipping condition ' + cond.name + ' because it '
                        'requires option ' + option + ' but this option is' +
                        ' not available'
                    )
                    missing_reqd_option = True
                    break  # required option is absent
                if NO_COL_FAMILY in reqd_options_dict[option]:
                    options.append(reqd_options_dict[option][NO_COL_FAMILY])
                else:
                    # placeholder, filled in per column family below
                    options.append(None)
                    incomplete_option_ix.append(ix)

            if missing_reqd_option:
                continue

            # if all the options are database-wide options
            if not incomplete_option_ix:
                try:
                    if eval(cond.eval_expr):
                        cond.set_trigger({NO_COL_FAMILY: options})
                except Exception as e:
                    print(
                        'WARNING(DatabaseOptions) check_and_trigger:' + str(e)
                    )
                continue

            # for all the options that are not database-wide, we look for their
            # values specific to column families
            col_fam_options_dict = {}
            # column_families stays None when the OPTIONS file has no
            # CFOptions section; then there is nothing to iterate over
            for col_fam in (self.column_families or []):
                present = True
                for ix in incomplete_option_ix:
                    option = cond.options[ix]
                    if col_fam not in reqd_options_dict[option]:
                        present = False
                        break
                    options[ix] = reqd_options_dict[option][col_fam]
                if present:
                    try:
                        if eval(cond.eval_expr):
                            col_fam_options_dict[col_fam] = (
                                copy.deepcopy(options)
                            )
                    except Exception as e:
                        print(
                            'WARNING(DatabaseOptions) check_and_trigger: ' +
                            str(e)
                        )
            if col_fam_options_dict:
                cond.set_trigger(col_fam_options_dict)
diff --git a/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py b/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py
new file mode 100755
index 000000000..cf497cf1f
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_log_parser import Log
+from advisor.db_timeseries_parser import TimeSeriesData, NO_ENTITY
+import copy
+import glob
+import re
+import subprocess
+import time
+
+
class LogStatsParser(TimeSeriesData):
    """Time-series data source built from statistics dumps in LOG files.

    Every log whose message contains the ``STATS`` marker is parsed line by
    line; the statistics that were asked for are stored in ``keys_ts`` under
    the ``NO_ENTITY`` placeholder, keyed by the timestamp of that log.
    """
    STATS = 'STATISTICS:'

    @staticmethod
    def parse_log_line_for_stats(log_line):
        """Return a ``{stat_name: value}`` dictionary for one stats line.

        Stat names are lower-cased (names are case insensitive) and every
        value is converted to ``float``. A blank line yields an empty
        dictionary.
        """
        # Example stat line (from LOG file):
        # "rocksdb.db.get.micros P50 : 8.4 P95 : 21.8 P99 : 33.9 P100 : 92.0"
        token_list = log_line.strip().split()
        if not token_list:
            # nothing to parse; indexing token_list[0] would fail otherwise
            return {}
        # token_list = ['rocksdb.db.get.micros', 'P50', ':', '8.4', 'P95', ':',
        # '21.8', 'P99', ':', '33.9', 'P100', ':', '92.0']
        stat_prefix = token_list[0] + '.'  # 'rocksdb.db.get.micros.'
        stat_values = [token for token in token_list[1:] if token != ':']
        # stat_values = ['P50', '8.4', 'P95', '21.8', 'P99', '33.9', 'P100',
        # '92.0'], i.e. alternating metric names and metric values
        stat_dict = {}
        for metric, value in zip(stat_values[::2], stat_values[1::2]):
            stat_dict[(stat_prefix + metric).lower()] = float(value)
        # stat_dict = {'rocksdb.db.get.micros.p50': 8.4,
        # 'rocksdb.db.get.micros.p95': 21.8, 'rocksdb.db.get.micros.p99': 33.9,
        # 'rocksdb.db.get.micros.p100': 92.0}
        return stat_dict

    def __init__(self, logs_path_prefix, stats_freq_sec):
        """Remember the LOG path prefix and the statistics dump frequency."""
        super().__init__()
        self.logs_file_prefix = logs_path_prefix
        self.stats_freq_sec = stats_freq_sec
        # read by TimeSeriesData.fetch_burst_epochs when it scales a rate
        self.duration_sec = 60

    def get_keys_from_conditions(self, conditions):
        """Return the lower-cased keys of all the given conditions."""
        # Note: case insensitive stat names
        reqd_stats = []
        for cond in conditions:
            for key in cond.keys:
                key = key.lower()
                # some keys are prepended with '[]' for OdsStatsFetcher to
                # replace this with the appropriate key_prefix, remove these
                # characters here since the LogStatsParser does not need
                # a prefix
                if key.startswith('[]'):
                    key = key[2:]
                reqd_stats.append(key)
        return reqd_stats

    def add_to_timeseries(self, log, reqd_stats):
        """Add the required stats found in ``log`` to ``keys_ts``.

        ``log`` is the Log object that contains the Rocksdb stats and
        ``reqd_stats`` is the list of stat names to keep.
        """
        # Example: reqd_stats = ['rocksdb.block.cache.hit.count',
        # 'rocksdb.db.get.micros.p99']
        # Let log.get_message() return the following string:
        # "[WARN] [db/db_impl.cc:485] STATISTICS:\n
        # rocksdb.block.cache.miss COUNT : 1459\n
        # rocksdb.block.cache.hit COUNT : 37\n
        # ...
        # rocksdb.db.get.micros P50 : 15.6 P95 : 39.7 P99 : 62.6 P100 : 148.0\n
        # ..."
        # and let log.get_timestamp() return 1532518219; then the updates are:
        # keys_ts[NO_ENTITY]['rocksdb.db.get.micros.p99'][1532518219] = 62.6
        # keys_ts[NO_ENTITY]['rocksdb.block.cache.hit.count'][1532518219] = 37
        wanted = set(reqd_stats)  # built once: O(1) membership per stat
        log_ts = log.get_timestamp()
        series = self.keys_ts[NO_ENTITY]
        # the first line of the message does not contain any stats
        for line in log.get_message().split('\n')[1:]:
            for stat, value in self.parse_log_line_for_stats(line).items():
                if stat in wanted:
                    series.setdefault(stat, {})[log_ts] = value

    def fetch_timeseries(self, reqd_stats):
        """Parse the LOG files and build a timeseries per required stat."""
        import os

        self.keys_ts = {NO_ENTITY: {}}
        for file_name in glob.glob(self.logs_file_prefix + '*'):
            # TODO(poojam23): find a way to distinguish between 'old' log files
            # from current and previous experiments, present in the same
            # directory
            # Only the file name is inspected: matching against the whole
            # path would skip every log stored under a directory whose name
            # merely contains 'old' (e.g. 'folder').
            if re.search('old', os.path.basename(file_name), re.IGNORECASE):
                continue
            with open(file_name, 'r') as db_logs:
                new_log = None
                for line in db_logs:
                    if Log.is_new_log(line):
                        if new_log and self.STATS in new_log.get_message():
                            self.add_to_timeseries(new_log, reqd_stats)
                        new_log = Log(line, column_families=[])
                    elif new_log:
                        # To account for logs split into multiple lines
                        new_log.append_message(line)
                    # else: continuation line before any log header; there
                    # is no log to attach it to, so it is ignored
                # Check for the last log in the file.
                if new_log and self.STATS in new_log.get_message():
                    self.add_to_timeseries(new_log, reqd_stats)
+
+
class DatabasePerfContext(TimeSeriesData):
    """Time-series data source wrapping perf context metrics given up front."""
    # TODO(poojam23): check if any benchrunner provides PerfContext sampled at
    # regular intervals

    def __init__(self, perf_context_ts, stats_freq_sec, cumulative):
        '''
        perf_context_ts is expected to be in the following format:
        Dict[metric, Dict[timestamp, value]], where for
        each (metric, timestamp) pair, the value is database-wide (i.e.
        summed over all the threads involved)
        if stats_freq_sec == 0, per-metric only one value is reported
        if cumulative is truthy, the values are converted to per-interval
        differences right away
        '''
        super().__init__()
        self.stats_freq_sec = stats_freq_sec
        self.keys_ts = {NO_ENTITY: perf_context_ts}
        if cumulative:
            self.unaccumulate_metrics()

    def unaccumulate_metrics(self):
        """Convert cumulative metric values into disjoint per-interval ones.

        Raises ValueError when a value is smaller than its predecessor.
        """
        disjoint = copy.deepcopy(self.keys_ts)
        for stat, original_series in self.keys_ts[NO_ENTITY].items():
            newest_first = sorted(original_series, reverse=True)
            if len(newest_first) < 2:
                continue
            stat_series = disjoint[NO_ENTITY][stat]
            # walk from the newest timestamp backwards, so the value being
            # subtracted has not been rewritten yet
            for newer, older in zip(newest_first, newest_first[1:]):
                stat_series[newer] = stat_series[newer] - stat_series[older]
                if stat_series[newer] < 0:
                    raise ValueError('DBPerfContext: really cumulative?')
            # drop the smallest timestamp in the timeseries for this metric
            stat_series.pop(newest_first[-1])
        self.keys_ts = disjoint

    def get_keys_from_conditions(self, conditions):
        """Return the lower-cased keys of all the given conditions."""
        return [key.lower() for cond in conditions for key in cond.keys]

    def fetch_timeseries(self, statistics):
        """Do nothing: __init__ already populated 'keys_ts'."""
        pass
+
+
class OdsStatsFetcher(TimeSeriesData):
    """Time-series data source that shells out to a 'rapido' or 'ods' client.

    The client's stdout is captured in OUTPUT_FILE and then parsed into
    ``keys_ts``.
    """
    # class constants
    OUTPUT_FILE = 'temp/stats_out.tmp'
    ERROR_FILE = 'temp/stats_err.tmp'
    RAPIDO_COMMAND = "%s --entity=%s --key=%s --tstart=%s --tend=%s --showtime"

    # static methods
    @staticmethod
    def _get_string_in_quotes(value):
        """Return str(value) wrapped in double quotes."""
        return '"' + str(value) + '"'

    @staticmethod
    def _get_time_value_pair(pair_string):
        """Parse a '[timestamp, value]' string into [int, float]."""
        # example pair_string: '[1532544591, 97.3653601828]'
        pair = pair_string.replace('[', '').replace(']', '').split(',')
        return [int(pair[0].strip()), float(pair[1].strip())]

    @staticmethod
    def _get_ods_cli_stime(start_time):
        """Return the seconds elapsed since start_time as '<seconds>_s'."""
        diff = int(time.time() - int(start_time))
        return str(diff) + '_s'

    def __init__(
        self, client, entities, start_time, end_time, key_prefix=None
    ):
        super().__init__()
        self.client = client
        self.entities = entities
        self.start_time = start_time
        self.end_time = end_time
        self.key_prefix = key_prefix
        self.stats_freq_sec = 60
        self.duration_sec = 60

    def execute_script(self, command):
        """Run 'command', sending stdout/stderr to OUTPUT_FILE/ERROR_FILE."""
        print('executing...')
        print(command)
        # NOTE(review): 'command' is a shell string assembled from the
        # constructor arguments and condition keys and is run with
        # shell=True; those inputs must come from a trusted source.
        # 'with' guarantees both files are closed even if the call raises.
        with open(self.OUTPUT_FILE, "w+") as out_file:
            with open(self.ERROR_FILE, "w+") as err_file:
                subprocess.call(
                    command, shell=True, stdout=out_file, stderr=err_file
                )

    def parse_rapido_output(self):
        """Populate 'keys_ts' from rapido output stored in OUTPUT_FILE."""
        # Output looks like the following:
        # <entity_name>\t<key_name>\t[[ts, value], [ts, value], ...]
        # ts = timestamp; value = value of key_name in entity_name at time ts
        self.keys_ts = {}
        with open(self.OUTPUT_FILE, 'r') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue  # a blank line carries no data
                token_list = line.split('\t')
                entity = token_list[0]
                key = token_list[1]
                pairs = [
                    self._get_time_value_pair(pair_string)
                    for pair_string in token_list[2].split('],')
                ]
                self.keys_ts.setdefault(entity, {})[key] = {
                    pair[0]: pair[1] for pair in pairs
                }

    def parse_ods_output(self):
        """Populate 'keys_ts' from ods output stored in OUTPUT_FILE."""
        # Output looks like the following:
        # <entity_name>\t<key_name>\t<timestamp>\t<value>
        # there is one line per (entity_name, key_name, timestamp)
        # NOTE(review): timestamps and values are stored as the raw string
        # tokens, unlike parse_rapido_output which stores int/float; the
        # arithmetic in TimeSeriesData needs numbers -- confirm the output
        # format before converting here.
        self.keys_ts = {}
        with open(self.OUTPUT_FILE, 'r') as fp:
            for line in fp:
                token_list = line.split()
                if not token_list:
                    continue  # a blank line carries no data
                entity = token_list[0]
                key = token_list[1]
                series = self.keys_ts.setdefault(entity, {}).setdefault(
                    key, {}
                )
                series[token_list[2]] = token_list[3]

    def fetch_timeseries(self, statistics):
        """Fetch the timeseries of 'statistics' and populate 'keys_ts'.

        Nothing is fetched when the client name contains neither 'rapido'
        nor 'ods'.
        """
        print('OdsStatsFetcher: fetching ' + str(statistics))
        if re.search('rapido', self.client, re.IGNORECASE):
            command = self.RAPIDO_COMMAND % (
                self.client,
                self._get_string_in_quotes(self.entities),
                self._get_string_in_quotes(','.join(statistics)),
                self._get_string_in_quotes(self.start_time),
                self._get_string_in_quotes(self.end_time)
            )
            # Run the tool and fetch the time-series data
            self.execute_script(command)
            # Parse output and populate the 'keys_ts' map
            self.parse_rapido_output()
        elif re.search('ods', self.client, re.IGNORECASE):
            command = (
                self.client + ' ' +
                '--stime=' + self._get_ods_cli_stime(self.start_time) + ' ' +
                self._get_string_in_quotes(self.entities) + ' ' +
                self._get_string_in_quotes(','.join(statistics))
            )
            # Run the tool and fetch the time-series data
            self.execute_script(command)
            # Parse output and populate the 'keys_ts' map
            self.parse_ods_output()

    def get_keys_from_conditions(self, conditions):
        """Return the conditions' keys rewritten for this fetcher.

        A leading '[]' is replaced by 'key_prefix' (when one was provided)
        and keys starting with "rocksdb" get the suffix ".60".
        """
        reqd_stats = []
        for cond in conditions:
            for key in cond.keys:
                use_prefix = key.startswith('[]')
                if use_prefix:
                    key = key[2:]
                # TODO(poojam23): this is very hacky and needs to be improved
                if key.startswith("rocksdb"):
                    key += ".60"
                if use_prefix:
                    if not self.key_prefix:
                        print('Warning: OdsStatsFetcher might need key prefix')
                        print('for the key: ' + key)
                    else:
                        key = self.key_prefix + "." + key
                reqd_stats.append(key)
        return reqd_stats

    def fetch_rate_url(self, entities, keys, window_len, percent, display):
        """Run the client with a rate() transform; return its first output line.

        Raises ValueError when the client name contains neither 'rapido'
        nor 'ods', since no command can be built for it.
        """
        # type: (List[str], List[str], str, str, bool) -> str
        transform_desc = (
            "rate(" + str(window_len) + ",duration=" + str(self.duration_sec)
        )
        if percent:
            transform_desc = transform_desc + ",%)"
        else:
            transform_desc = transform_desc + ")"
        if re.search('rapido', self.client, re.IGNORECASE):
            command = self.RAPIDO_COMMAND + " --transform=%s --url=%s"
            command = command % (
                self.client,
                self._get_string_in_quotes(','.join(entities)),
                self._get_string_in_quotes(','.join(keys)),
                self._get_string_in_quotes(self.start_time),
                self._get_string_in_quotes(self.end_time),
                self._get_string_in_quotes(transform_desc),
                self._get_string_in_quotes(display)
            )
        elif re.search('ods', self.client, re.IGNORECASE):
            # quoting the list itself would produce "['a', 'b']"; join it the
            # same way the rapido branch does (a plain string is kept as is)
            entity_arg = (
                entities if isinstance(entities, str) else ','.join(entities)
            )
            command = (
                self.client + ' ' +
                '--stime=' + self._get_ods_cli_stime(self.start_time) + ' ' +
                '--fburlonly ' +
                self._get_string_in_quotes(entity_arg) + ' ' +
                self._get_string_in_quotes(','.join(keys)) + ' ' +
                self._get_string_in_quotes(transform_desc)
            )
        else:
            raise ValueError(
                'OdsStatsFetcher: unsupported client: ' + str(self.client)
            )
        self.execute_script(command)
        with open(self.OUTPUT_FILE, 'r') as fp:
            url = fp.readline()
        return url
diff --git a/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py b/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py
new file mode 100644
index 000000000..308eb139a
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from abc import abstractmethod
+from advisor.db_log_parser import DataSource
+from enum import Enum
+import math
+
+
+NO_ENTITY = 'ENTITY_PLACEHOLDER'
+
+
class TimeSeriesData(DataSource):
    """Base class for data sources that expose statistics as timeseries.

    Subclasses populate ``keys_ts`` (see ``fetch_timeseries``); this class
    evaluates time-series conditions against it.
    """

    class Behavior(Enum):
        bursty = 1
        evaluate_expression = 2

    class AggregationOperator(Enum):
        avg = 1
        max = 2
        min = 3
        latest = 4
        oldest = 5

    def __init__(self):
        super().__init__(DataSource.Type.TIME_SERIES)
        self.keys_ts = None  # Dict[entity, Dict[key, Dict[timestamp, value]]]
        self.stats_freq_sec = None
        # fetch_burst_epochs reads this attribute; give it a default so that
        # subclasses which never set it do not fail with AttributeError.
        # 60 is the value LogStatsParser and OdsStatsFetcher assign.
        self.duration_sec = 60

    @abstractmethod
    def get_keys_from_conditions(self, conditions):
        # This method takes in a list of time-series conditions; for each
        # condition it manipulates the 'keys' in the way that is supported by
        # the subclass implementing this method
        pass

    @abstractmethod
    def fetch_timeseries(self, required_statistics):
        # this method takes in a list of statistics and fetches the timeseries
        # for each of them and populates the 'keys_ts' dictionary
        pass

    def fetch_burst_epochs(
        self, entities, statistic, window_sec, threshold, percent
    ):
        """Return the epochs at which 'statistic' changed at a bursty rate.

        The (percent) rate change of 'statistic' is calculated for each
        entity over 'window_sec' seconds; the result is
        Dict[entity, Dict[timestamp, rate]] restricted to the timestamps
        where the rate is greater than or equal to 'threshold'.
        Returns None when 'stats_freq_sec' is 0.
        """
        if self.stats_freq_sec == 0:
            # not time series data, cannot check for bursty behavior
            return
        if window_sec < self.stats_freq_sec:
            window_sec = self.stats_freq_sec
        # 'window_samples' is the number of windows to go back to
        # compare the current window with, while calculating rate change.
        window_samples = math.ceil(window_sec / self.stats_freq_sec)
        burst_epochs = {}
        # if percent = False:
        # curr_val = value at window for which rate change is being calculated
        # prev_val = value at window that is window_samples behind curr_window
        # Then rate_without_percent =
        # ((curr_val-prev_val)*duration_sec)/(curr_timestamp-prev_timestamp)
        # if percent = True:
        # rate_with_percent = (rate_without_percent * 100) / prev_val
        # These calculations are in line with the rate() transform supported
        # by ODS
        for entity in entities:
            if statistic not in self.keys_ts[entity]:
                continue
            series = self.keys_ts[entity][statistic]
            timestamps = sorted(series)
            for ix in range(window_samples, len(timestamps)):
                first_ts = timestamps[ix - window_samples]
                last_ts = timestamps[ix]
                first_val = series[first_ts]
                diff = series[last_ts] - first_val
                if percent:
                    if first_val == 0:
                        # a percent change relative to zero is undefined;
                        # skip this window instead of dividing by zero
                        continue
                    diff = diff * 100 / first_val
                rate = (diff * self.duration_sec) / (last_ts - first_ts)
                # if the rate change is greater than the provided threshold,
                # then the condition is triggered for entity at time 'last_ts'
                if rate >= threshold:
                    burst_epochs.setdefault(entity, {})[last_ts] = rate
        return burst_epochs

    def fetch_aggregated_values(self, entity, statistics, aggregation_op):
        """Aggregate the timeseries of each statistic for 'entity'.

        Returns Dict[statistic, aggregated_value]; statistics without a
        timeseries for 'entity' are left out. The value is None when
        'aggregation_op' is not an AggregationOperator member.
        """
        result = {}
        for stat in statistics:
            if stat not in self.keys_ts[entity]:
                continue
            series = self.keys_ts[entity][stat]
            agg_val = None
            if aggregation_op is self.AggregationOperator.latest:
                agg_val = series[max(series)]
            elif aggregation_op is self.AggregationOperator.oldest:
                agg_val = series[min(series)]
            elif aggregation_op is self.AggregationOperator.max:
                agg_val = max(series.values())
            elif aggregation_op is self.AggregationOperator.min:
                agg_val = min(series.values())
            elif aggregation_op is self.AggregationOperator.avg:
                values = list(series.values())
                agg_val = sum(values) / len(values)
            result[stat] = agg_val
        return result

    def check_and_trigger_conditions(self, conditions):
        """Fetch the stats needed by 'conditions' and set their triggers."""
        # get the list of statistics that need to be fetched
        reqd_keys = self.get_keys_from_conditions(conditions)
        # fetch the required statistics and populate the map 'keys_ts'
        self.fetch_timeseries(reqd_keys)
        if not self.keys_ts:
            # nothing was fetched, so no condition can be triggered
            return
        # Trigger the appropriate conditions
        for cond in conditions:
            complete_keys = self.get_keys_from_conditions([cond])
            # Get the entities that have all statistics required by 'cond':
            # an entity is checked for a given condition only if we possess all
            # of the condition's 'keys' for that entity
            entities_with_stats = [
                entity
                for entity in self.keys_ts
                if all(stat in self.keys_ts[entity] for stat in complete_keys)
            ]
            if not entities_with_stats:
                continue
            if cond.behavior is self.Behavior.bursty:
                # for a condition that checks for bursty behavior, only one key
                # should be present in the condition's 'keys' field
                result = self.fetch_burst_epochs(
                    entities_with_stats,
                    complete_keys[0],  # there should be only one key
                    cond.window_sec,
                    cond.rate_threshold,
                    True
                )
                # Trigger in this case is:
                # Dict[entity_name, Dict[timestamp, rate_change]]
                # where the inner dictionary contains rate_change values when
                # the rate_change >= threshold provided, with the
                # corresponding timestamps
                if result:
                    cond.set_trigger(result)
            elif cond.behavior is self.Behavior.evaluate_expression:
                self.handle_evaluate_expression(
                    cond,
                    complete_keys,
                    entities_with_stats
                )

    def handle_evaluate_expression(self, condition, statistics, entities):
        """Evaluate the condition's 'expression' for each entity.

        The expression is evaluated with eval() and sees the local name
        'keys', the list of values of 'statistics' (in that order).
        NOTE(review): eval() executes arbitrary code, so the expression
        must come from a trusted source.
        """
        trigger = {}
        # check 'condition' for each of these entities
        for entity in entities:
            if hasattr(condition, 'aggregation_op'):
                # in this case, the aggregation operation is performed on each
                # of the condition's 'keys' and then with aggregated values
                # condition's 'expression' is evaluated; if it evaluates to
                # True, then list of the keys values is added to the
                # condition's trigger: Dict[entity_name, List[stats]]
                result = self.fetch_aggregated_values(
                    entity, statistics, condition.aggregation_op
                )
                # the name 'keys' is referenced by the evaluated expression
                keys = [result[key] for key in statistics]
                try:
                    if eval(condition.expression):
                        trigger[entity] = keys
                except Exception as e:
                    print(
                        'WARNING(TimeSeriesData) check_and_trigger: ' + str(e)
                    )
            else:
                # assumption: all stats have same series of timestamps
                # this is similar to the above but 'expression' is evaluated at
                # each timestamp, since there is no aggregation, and all the
                # epochs are added to the trigger when the condition's
                # 'expression' evaluated to true; so trigger is:
                # Dict[entity, Dict[timestamp, List[stats]]]
                for epoch in self.keys_ts[entity][statistics[0]].keys():
                    # the name 'keys' is referenced by the evaluated expression
                    keys = [
                        self.keys_ts[entity][key][epoch]
                        for key in statistics
                    ]
                    try:
                        if eval(condition.expression):
                            trigger.setdefault(entity, {})[epoch] = keys
                    except Exception as e:
                        print(
                            'WARNING(TimeSeriesData) check_and_trigger: ' +
                            str(e)
                        )
        if trigger:
            condition.set_trigger(trigger)
diff --git a/src/rocksdb/tools/advisor/advisor/ini_parser.py b/src/rocksdb/tools/advisor/advisor/ini_parser.py
new file mode 100644
index 000000000..4776ef209
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/ini_parser.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from enum import Enum
+
+
class IniParser:
    """Line-oriented helpers for parsing the INI-like rules specification."""

    class Element(Enum):
        rule = 1
        cond = 2
        sugg = 3
        key_val = 4
        comment = 5

    @staticmethod
    def remove_trailing_comment(line):
        """Return the stripped line cut at the first '#', if there is one."""
        line = line.strip()
        comment_start = line.find('#')
        if comment_start > -1:
            return line[:comment_start]
        return line

    @staticmethod
    def is_section_header(line):
        """Return True when the stripped line is enclosed in square brackets."""
        # A section header looks like: [Rule "my-new-rule"]. Essentially,
        # a line that is in square-brackets.
        line = line.strip()
        return line.startswith('[') and line.endswith(']')

    @staticmethod
    def get_section_name(line):
        """Return the quoted name of a section header.

        For a section header: [Rule "my-new-rule"], this method will return
        "my-new-rule". Raises ValueError when the header has no quoted name.
        """
        token_list = line.strip()[1:-1].split('"')
        if len(token_list) < 3:
            error = 'needed section header: [<section_type> "<section_name>"]'
            raise ValueError('Parsing error: ' + error + '\n' + line)
        return token_list[1]

    @staticmethod
    def get_element(line):
        """Classify a line as one of the IniParser.Element members.

        Raises ValueError for a line that is neither a comment, a
        Suggestion/Rule/Condition section header nor a key-value pair.
        """
        line = IniParser.remove_trailing_comment(line)
        if not line:
            return IniParser.Element.comment
        error = 'not a recognizable RulesSpec element'
        if IniParser.is_section_header(line):
            header = line.strip()[1:-1]
            if header.startswith('Suggestion'):
                return IniParser.Element.sugg
            if header.startswith('Rule'):
                return IniParser.Element.rule
            if header.startswith('Condition'):
                return IniParser.Element.cond
            # a header of an unknown type must not be mistaken for a
            # key-value pair just because its name contains '='
            raise ValueError('Parsing error: ' + error + '\n' + line)
        if '=' in line:
            return IniParser.Element.key_val
        raise ValueError('Parsing error: ' + error + '\n' + line)

    @staticmethod
    def get_key_value_pair(line):
        """Split 'key=value' into (key, value).

        The value is None when nothing follows the first '=', a list when
        it contains ':' separators and a stripped string otherwise.
        """
        key, _, value = line.strip().partition('=')
        key = key.strip()
        if value == "":  # if the option has no value
            return (key, None)
        values = IniParser.get_list_from_value(value)
        if len(values) == 1:
            # values[0] is the stripped value, so 'key = value' does not
            # yield a value with a leading space
            return (key, values[0])
        return (key, values)

    @staticmethod
    def get_list_from_value(value):
        """Split the stripped value on ':' and return the resulting list."""
        return value.strip().split(':')
diff --git a/src/rocksdb/tools/advisor/advisor/rule_parser.py b/src/rocksdb/tools/advisor/advisor/rule_parser.py
new file mode 100644
index 000000000..592218f4a
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rule_parser.py
@@ -0,0 +1,528 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from abc import ABC, abstractmethod
+from advisor.db_log_parser import DataSource, NO_COL_FAMILY
+from advisor.db_timeseries_parser import TimeSeriesData
+from enum import Enum
+from advisor.ini_parser import IniParser
+import re
+
+
class Section(ABC):
    """Abstract base of the named sections of a rules specification."""

    def __init__(self, name):
        self.name = name

    @abstractmethod
    def set_parameter(self, key, value):
        """Store the given key-value pair on the section."""

    @abstractmethod
    def perform_checks(self):
        """Validate the section."""
+
+
class Rule(Section):
    """A rule: a set of conditions and the suggestions made when all hold.

    When 'overlap_time_period' is specified the rule must have exactly two
    TIME_SERIES conditions and is triggered when both were triggered within
    that period of each other.
    """

    def __init__(self, name):
        super().__init__(name)
        self.conditions = None
        self.suggestions = None
        self.overlap_time_seconds = None
        self.trigger_entities = None
        self.trigger_column_families = None

    def set_parameter(self, key, value):
        """Store 'conditions', 'suggestions' or 'overlap_time_period'."""
        # If the Rule is associated with a single suggestion/condition, then
        # value will be a string and not a list. Hence, convert it to a single
        # element list before storing it in self.suggestions or
        # self.conditions.
        if key == 'conditions':
            self.conditions = [value] if isinstance(value, str) else value
        elif key == 'suggestions':
            self.suggestions = [value] if isinstance(value, str) else value
        elif key == 'overlap_time_period':
            self.overlap_time_seconds = value

    def get_suggestions(self):
        return self.suggestions

    def perform_checks(self):
        """Validate the rule and convert the overlap period to seconds.

        Raises ValueError when conditions or suggestions are missing, when
        an overlap period is given without exactly 2 conditions, or when the
        overlap period is not of the form <digits><s|m|h|d>.
        """
        if not self.conditions:
            raise ValueError(
                self.name + ': rule must have at least one condition'
            )
        if not self.suggestions:
            raise ValueError(
                self.name + ': rule must have at least one suggestion'
            )
        if self.overlap_time_seconds:
            if len(self.conditions) != 2:
                raise ValueError(
                    self.name + ": rule must be associated with 2 conditions"
                    " in order to check for a time dependency between them"
                )
            # a character class needs no '|' separators: '[s|m|h|d]' would
            # also accept a literal '|' as the unit
            if not re.fullmatch(
                r'\d+[smhd]', self.overlap_time_seconds, re.IGNORECASE
            ):
                raise ValueError(
                    self.name + ": overlap_time_seconds format: \\d+[s|m|h|d]"
                )
            # convert to seconds; the unit is lower-cased because the match
            # above is case insensitive
            seconds_per_unit = {
                's': 1, 'm': 60, 'h': 60 * 60, 'd': 24 * 60 * 60
            }
            unit = self.overlap_time_seconds[-1].lower()
            self.overlap_time_seconds = (
                int(self.overlap_time_seconds[:-1]) * seconds_per_unit[unit]
            )

    def get_overlap_timestamps(self, key1_trigger_epochs, key2_trigger_epochs):
        """Return the first overlapping pair of trigger timestamps, if any.

        This method takes in 2 timeseries i.e. timestamps at which the
        rule's 2 TIME_SERIES conditions were triggered and it finds
        (if present) the first pair of timestamps at which the 2 conditions
        were triggered within 'overlap_time_seconds' of each other. The pair
        is (key2_timestamp, key1_timestamp); None is returned when there is
        no such pair. The input lists are not modified.
        """
        key1_lower_bounds = sorted(
            epoch - self.overlap_time_seconds
            for epoch in key1_trigger_epochs
        )
        key2_epochs = sorted(key2_trigger_epochs)
        trigger_ix = 0
        for key1_lb in key1_lower_bounds:
            # the bounds check must come first, otherwise the index runs
            # past the end of the list
            while (
                trigger_ix < len(key2_epochs) and
                key2_epochs[trigger_ix] < key1_lb
            ):
                trigger_ix += 1
            if trigger_ix >= len(key2_epochs):
                break
            if (
                key2_epochs[trigger_ix] <=
                key1_lb + (2 * self.overlap_time_seconds)
            ):
                return (
                    key2_epochs[trigger_ix],
                    key1_lb + self.overlap_time_seconds
                )
        return None

    def get_trigger_entities(self):
        return self.trigger_entities

    def get_trigger_column_families(self):
        return self.trigger_column_families

    def is_triggered(self, conditions_dict, column_families):
        """Return True when the rule is triggered.

        'conditions_dict' maps condition names to condition objects. On
        success 'trigger_entities' and 'trigger_column_families' describe
        what the rule was triggered for.
        Raises ValueError when an overlap period is set but the 2 conditions
        are not both TIME_SERIES conditions.
        """
        # start from a clean slate so that results of an earlier call are
        # never mixed into this evaluation
        self.trigger_entities = None
        self.trigger_column_families = None
        if self.overlap_time_seconds:
            condition1 = conditions_dict[self.conditions[0]]
            condition2 = conditions_dict[self.conditions[1]]
            if not (
                condition1.get_data_source() is DataSource.Type.TIME_SERIES and
                condition2.get_data_source() is DataSource.Type.TIME_SERIES
            ):
                raise ValueError(self.name + ': need 2 timeseries conditions')

            map1 = condition1.get_trigger()
            map2 = condition2.get_trigger()
            if not (map1 and map2):
                return False

            self.trigger_entities = {}
            is_triggered = False
            for entity in set(map1).intersection(map2):
                overlap_timestamps_pair = self.get_overlap_timestamps(
                    list(map1[entity].keys()), list(map2[entity].keys())
                )
                if overlap_timestamps_pair:
                    self.trigger_entities[entity] = overlap_timestamps_pair
                    is_triggered = True
            if is_triggered:
                self.trigger_column_families = set(column_families)
            return is_triggered

        all_conditions_triggered = True
        self.trigger_column_families = set(column_families)
        for cond_name in self.conditions:
            cond = conditions_dict[cond_name]
            if not cond.get_trigger():
                all_conditions_triggered = False
                break
            if (
                cond.get_data_source() is DataSource.Type.LOG or
                cond.get_data_source() is DataSource.Type.DB_OPTIONS
            ):
                cond_col_fam = set(cond.get_trigger().keys())
                if NO_COL_FAMILY in cond_col_fam:
                    # a database-wide trigger applies to every column family
                    cond_col_fam = set(column_families)
                self.trigger_column_families = (
                    self.trigger_column_families.intersection(cond_col_fam)
                )
            elif cond.get_data_source() is DataSource.Type.TIME_SERIES:
                cond_entities = set(cond.get_trigger().keys())
                if self.trigger_entities is None:
                    self.trigger_entities = cond_entities
                else:
                    self.trigger_entities = (
                        self.trigger_entities.intersection(cond_entities)
                    )
            if not (self.trigger_entities or self.trigger_column_families):
                all_conditions_triggered = False
                break
        if not all_conditions_triggered:  # clean up if rule not triggered
            self.trigger_column_families = None
            self.trigger_entities = None
        return all_conditions_triggered

    def __repr__(self):
        # Append conditions
        rule_string = "Rule: " + self.name + " has conditions:: "
        rule_string += " AND ".join(self.conditions)
        # Append suggestions
        rule_string += "\nsuggestions:: "
        rule_string += ", ".join(self.suggestions)
        if self.trigger_entities:
            rule_string += (', entities:: ' + str(self.trigger_entities))
        if self.trigger_column_families:
            rule_string += (', col_fam:: ' + str(self.trigger_column_families))
        # Return constructed string
        return rule_string
+
+
class Suggestion(Section):
    """A suggestion: an action on an option and/or a free-text description."""

    class Action(Enum):
        set = 1
        increase = 2
        decrease = 3

    def __init__(self, name):
        super().__init__(name)
        self.option = None
        self.action = None
        self.suggested_values = None
        self.description = None

    def set_parameter(self, key, value):
        """Store 'option', 'action', 'suggested_values' or 'description'.

        Raises ValueError when an option was already set and the action
        value is empty.
        """
        if key == 'description':
            self.description = value
        elif key == 'suggested_values':
            # a single value arrives as a string; always keep a list
            self.suggested_values = (
                [value] if isinstance(value, str) else value
            )
        elif key == 'action':
            if self.option and not value:
                raise ValueError(self.name + ': provide action for option')
            self.action = self.Action[value]
        elif key == 'option':
            # Note:
            # case 1: 'option' is supported by Rocksdb OPTIONS file; in this
            # case the option belongs to one of the sections in the config
            # file and its name is prefixed by "<section_type>."
            # case 2: 'option' is not supported by Rocksdb OPTIONS file; the
            # option is not expected to have the character '.' in its name
            self.option = value

    def perform_checks(self):
        """Validate the suggestion; a description alone is sufficient.

        Raises ValueError when there is no description and the option, the
        action or (for action 'set') the suggested values are missing.
        """
        if self.description:
            return
        if not self.option:
            raise ValueError(self.name + ': provide option or description')
        if not self.action:
            raise ValueError(self.name + ': provide action for option')
        if self.action is self.Action.set and not self.suggested_values:
            raise ValueError(
                self.name + ': provide suggested value for option'
            )

    def __repr__(self):
        parts = ["Suggestion: " + self.name]
        if self.description:
            parts.append(' description : ' + self.description)
        else:
            parts.append(
                ' option : ' + self.option + ' action : ' + self.action.name
            )
            if self.suggested_values:
                parts.append(
                    ' suggested_values : ' + str(self.suggested_values)
                )
        return ''.join(parts)
+
+
class Condition(Section):
    """Base condition: holds a data source and the current trigger."""

    def __init__(self, name):
        super().__init__(name)
        self.data_source = None
        self.trigger = None

    def perform_checks(self):
        """Raise ValueError when no data source has been set."""
        if self.data_source:
            return
        raise ValueError(self.name + ': condition not tied to data source')

    def set_data_source(self, data_source):
        self.data_source = data_source

    def get_data_source(self):
        return self.data_source

    def reset_trigger(self):
        self.trigger = None

    def set_trigger(self, condition_trigger):
        self.trigger = condition_trigger

    def get_trigger(self):
        return self.trigger

    def is_triggered(self):
        """Return True when a non-empty trigger has been set."""
        return bool(self.trigger)

    def set_parameter(self, key, value):
        # must be defined by the subclass
        raise NotImplementedError(self.name + ': provide source for condition')
+
+
class LogCondition(Condition):
    """Condition tied to the LOG data source and described by a regex."""

    @classmethod
    def create(cls, base_condition):
        """Turn 'base_condition' into a LogCondition in place; return it."""
        base_condition.set_data_source(DataSource.Type['LOG'])
        base_condition.__class__ = cls
        return base_condition

    def set_parameter(self, key, value):
        if key == 'regex':
            self.regex = value

    def perform_checks(self):
        """Raise ValueError when the data source or the regex is missing."""
        super().perform_checks()
        # 'regex' only exists once set_parameter has stored it, so a plain
        # attribute access would raise AttributeError instead
        if not getattr(self, 'regex', None):
            raise ValueError(self.name + ': provide regex for log condition')

    def __repr__(self):
        log_cond_str = "LogCondition: " + self.name
        log_cond_str += (" regex: " + self.regex)
        return log_cond_str
+
+
class OptionCondition(Condition):
    """Condition tied to the DB_OPTIONS data source.

    It is described by a list of options and an expression to evaluate.
    """

    @classmethod
    def create(cls, base_condition):
        """Turn 'base_condition' into an OptionCondition in place."""
        base_condition.set_data_source(DataSource.Type['DB_OPTIONS'])
        base_condition.__class__ = cls
        return base_condition

    def set_parameter(self, key, value):
        if key == 'options':
            # a single option arrives as a string; always keep a list
            self.options = [value] if isinstance(value, str) else value
        elif key == 'evaluate':
            self.eval_expr = value

    def perform_checks(self):
        """Raise ValueError when data source, options or expression is missing.
        """
        super().perform_checks()
        # both attributes only exist once set_parameter has stored them, so
        # a plain attribute access would raise AttributeError instead
        if not getattr(self, 'options', None):
            raise ValueError(self.name + ': options missing in condition')
        if not getattr(self, 'eval_expr', None):
            raise ValueError(self.name + ': expression missing in condition')

    def __repr__(self):
        opt_cond_str = "OptionCondition: " + self.name
        opt_cond_str += (" options: " + str(self.options))
        opt_cond_str += (" expression: " + self.eval_expr)
        if self.trigger:
            opt_cond_str += (" trigger: " + str(self.trigger))
        return opt_cond_str
+
+
class TimeSeriesCondition(Condition):
    """Condition tied to the TIME_SERIES data source.

    It is described by one or more time-series keys and a triggering
    behavior, plus the parameters that behavior needs.
    """

    # Class-level defaults: instances are produced by re-classing an existing
    # Condition in create(), so no subclass __init__ ever runs. Without these
    # defaults perform_checks() would fail with AttributeError for any
    # omitted key; in particular the documented default for 'window_sec'
    # could never be applied.
    # 'aggregation_op' deliberately has no default: its presence is probed
    # with hasattr().
    keys = None
    behavior = None
    rate_threshold = None
    window_sec = None
    expression = None

    @classmethod
    def create(cls, base_condition):
        """Turn ``base_condition`` into a TimeSeriesCondition in place.

        The data source of the object is set to
        DataSource.Type['TIME_SERIES'] and its class is switched to ``cls``.
        The same object is returned.
        """
        base_condition.set_data_source(DataSource.Type['TIME_SERIES'])
        base_condition.__class__ = cls
        return base_condition

    def set_parameter(self, key, value):
        """Store one recognised parameter; ignore unknown keys.

        'keys' given as a single string is wrapped into a one-element list.
        'behavior' and 'aggregation_op' are looked up by name in the
        corresponding TimeSeriesData enums; 'rate_threshold' is converted to
        float and 'window_sec' to int.
        """
        if key == 'keys':
            if isinstance(value, str):
                self.keys = [value]
            else:
                self.keys = value
        elif key == 'behavior':
            self.behavior = TimeSeriesData.Behavior[value]
        elif key == 'rate_threshold':
            self.rate_threshold = float(value)
        elif key == 'window_sec':
            self.window_sec = int(value)
        elif key == 'evaluate':
            self.expression = value
        elif key == 'aggregation_op':
            self.aggregation_op = TimeSeriesData.AggregationOperator[value]

    def perform_checks(self):
        """Validate the condition and apply the default window length.

        Raises:
            ValueError: if keys or behavior are missing, if a bursty
                condition lacks a rate threshold or names more than one key,
                if an evaluate_expression condition lacks an expression, or
                if the behavior is not one of those two.
        """
        if not self.keys:
            raise ValueError(self.name + ': specify timeseries key')
        if not self.behavior:
            raise ValueError(self.name + ': specify triggering behavior')
        if self.behavior is TimeSeriesData.Behavior.bursty:
            if not self.rate_threshold:
                raise ValueError(self.name + ': specify rate burst threshold')
            if not self.window_sec:
                self.window_sec = 300  # default window length is 5 minutes
            if len(self.keys) > 1:
                raise ValueError(self.name + ': specify only one key')
        elif self.behavior is TimeSeriesData.Behavior.evaluate_expression:
            if not self.expression:
                raise ValueError(self.name + ': specify evaluation expression')
        else:
            raise ValueError(self.name + ': trigger behavior not supported')

    def __repr__(self):
        """Return a one-line summary of the condition and its trigger."""
        ts_cond_str = "TimeSeriesCondition: " + self.name
        ts_cond_str += (" statistics: " + str(self.keys))
        ts_cond_str += (" behavior: " + self.behavior.name)
        if self.behavior is TimeSeriesData.Behavior.bursty:
            ts_cond_str += (" rate_threshold: " + str(self.rate_threshold))
            ts_cond_str += (" window_sec: " + str(self.window_sec))
        if self.behavior is TimeSeriesData.Behavior.evaluate_expression:
            ts_cond_str += (" expression: " + self.expression)
            if hasattr(self, 'aggregation_op'):
                ts_cond_str += (" aggregation_op: " + self.aggregation_op.name)
        if self.trigger:
            ts_cond_str += (" trigger: " + str(self.trigger))
        return ts_cond_str
+
+
class RulesSpec:
    """In-memory form of a Rules INI file.

    After load_rules_from_spec() the rules, conditions and suggestions found
    in the file are available as dictionaries keyed by section name.
    """

    def __init__(self, rules_path):
        self.file_path = rules_path

    def initialise_fields(self):
        """(Re)create the three empty section dictionaries."""
        self.rules_dict = {}
        self.conditions_dict = {}
        self.suggestions_dict = {}

    def perform_section_checks(self):
        """Run perform_checks() on every rule, condition and suggestion."""
        section_groups = (
            self.rules_dict, self.conditions_dict, self.suggestions_dict
        )
        for group in section_groups:
            for section in group.values():
                section.perform_checks()

    def load_rules_from_spec(self):
        """Parse the file at ``self.file_path`` into the section dicts.

        Section headers create a new Rule, Condition or Suggestion; the
        key-value lines that follow are passed to the most recently created
        section of the current type. For a condition, the 'source' key
        selects the concrete condition class.
        """
        self.initialise_fields()
        with open(self.file_path, 'r') as spec_file:
            active_section = None
            for raw_line in spec_file:
                text = IniParser.remove_trailing_comment(raw_line)
                if not text:
                    continue
                kind = IniParser.get_element(text)
                if kind is IniParser.Element.comment:
                    continue
                if kind is not IniParser.Element.key_val:
                    # anything else is a new section header
                    active_section = kind
                    section_name = IniParser.get_section_name(text)
                    if kind is IniParser.Element.rule:
                        current_rule = Rule(section_name)
                        self.rules_dict[section_name] = current_rule
                    elif kind is IniParser.Element.cond:
                        current_cond = Condition(section_name)
                        self.conditions_dict[section_name] = current_cond
                    elif kind is IniParser.Element.sugg:
                        current_sugg = Suggestion(section_name)
                        self.suggestions_dict[section_name] = current_sugg
                    continue
                # a key-value line belonging to the active section
                key, value = IniParser.get_key_value_pair(text)
                if active_section is IniParser.Element.rule:
                    current_rule.set_parameter(key, value)
                elif active_section is IniParser.Element.cond:
                    if key != 'source':
                        current_cond.set_parameter(key, value)
                    elif value == 'LOG':
                        current_cond = LogCondition.create(current_cond)
                    elif value == 'OPTIONS':
                        current_cond = OptionCondition.create(current_cond)
                    elif value == 'TIME_SERIES':
                        current_cond = TimeSeriesCondition.create(
                            current_cond
                        )
                elif active_section is IniParser.Element.sugg:
                    current_sugg.set_parameter(key, value)

    def get_rules_dict(self):
        """Return the dictionary of rules keyed by section name."""
        return self.rules_dict

    def get_conditions_dict(self):
        """Return the dictionary of conditions keyed by section name."""
        return self.conditions_dict

    def get_suggestions_dict(self):
        """Return the dictionary of suggestions keyed by section name."""
        return self.suggestions_dict

    def get_triggered_rules(self, data_sources, column_families):
        """Trigger the conditions, then return the list of triggered rules.

        A rule is included when its is_triggered(conditions_dict,
        column_families) call returns a truthy value.
        """
        self.trigger_conditions(data_sources)
        return [
            rule
            for rule in self.rules_dict.values()
            if rule.is_triggered(self.conditions_dict, column_families)
        ]

    def trigger_conditions(self, data_sources):
        """Hand each data source the conditions tied to its source type.

        ``data_sources`` maps a source type to a list of source objects.
        Source types with no matching condition are skipped.
        """
        for source_type in data_sources:
            matching = []
            for cond in self.conditions_dict.values():
                if cond.get_data_source() is source_type:
                    matching.append(cond)
            if matching:
                for source in data_sources[source_type]:
                    source.check_and_trigger_conditions(matching)

    def print_rules(self, rules):
        """Print each rule with its conditions, suggestions and scope."""
        for rule in rules:
            print('\nRule: ' + rule.name)
            for cond_name in rule.conditions:
                condition = self.conditions_dict[cond_name]
                print(repr(condition))
            for sugg_name in rule.suggestions:
                suggestion = self.suggestions_dict[sugg_name]
                print(repr(suggestion))
            if rule.trigger_entities:
                print('scope: entities:')
                print(rule.trigger_entities)
            if rule.trigger_column_families:
                print('scope: col_fam:')
                print(rule.trigger_column_families)
diff --git a/src/rocksdb/tools/advisor/advisor/rule_parser_example.py b/src/rocksdb/tools/advisor/advisor/rule_parser_example.py
new file mode 100644
index 000000000..d2348e5ae
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rule_parser_example.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.rule_parser import RulesSpec
+from advisor.db_log_parser import DatabaseLogs, DataSource
+from advisor.db_options_parser import DatabaseOptions
+from advisor.db_stats_fetcher import LogStatsParser, OdsStatsFetcher
+import argparse
+
+
def main(args):
    """Load the rules, build the data sources and print triggered rules.

    ``args`` is the parsed command-line namespace; the attributes read here
    are rules_spec, rocksdb_options, log_files_path_prefix,
    stats_dump_period_sec and the ods_* values.
    """
    # parse and validate the rules specification
    spec = RulesSpec(args.rules_spec)
    spec.load_rules_from_spec()
    spec.perform_section_checks()

    # one data source per type: options file, LOG files, LOG statistics
    options_source = DatabaseOptions(args.rocksdb_options)
    logs_source = DatabaseLogs(
        args.log_files_path_prefix, options_source.get_column_families()
    )
    log_stats_source = LogStatsParser(
        args.log_files_path_prefix, args.stats_dump_period_sec
    )
    time_series_sources = [log_stats_source]
    data_sources = {
        DataSource.Type.DB_OPTIONS: [options_source],
        DataSource.Type.LOG: [logs_source],
        DataSource.Type.TIME_SERIES: time_series_sources
    }

    # ODS is an additional time-series source, used only when a client
    # binary was supplied
    if args.ods_client:
        ods_source = OdsStatsFetcher(
            args.ods_client,
            args.ods_entity,
            args.ods_tstart,
            args.ods_tend,
            args.ods_key_prefix
        )
        time_series_sources.append(ods_source)

    fired = spec.get_triggered_rules(
        data_sources, options_source.get_column_families()
    )
    spec.print_rules(fired)
+
+
if __name__ == '__main__':
    # Command-line entry point: declare the arguments, parse them and hand
    # the resulting namespace to main().
    # The description is built from adjacent string literals rather than a
    # backslash continuation inside one literal, so that source indentation
    # does not end up embedded in the runtime string.
    parser = argparse.ArgumentParser(
        description=(
            'Use this script to get suggestions for improving Rocksdb '
            'performance.'
        )
    )
    parser.add_argument(
        '--rules_spec', required=True, type=str,
        help='path of the file containing the expert-specified Rules'
    )
    parser.add_argument(
        '--rocksdb_options', required=True, type=str,
        help='path of the starting Rocksdb OPTIONS file'
    )
    parser.add_argument(
        '--log_files_path_prefix', required=True, type=str,
        help='path prefix of the Rocksdb LOG files'
    )
    parser.add_argument(
        '--stats_dump_period_sec', required=True, type=int,
        help=(
            'the frequency (in seconds) at which STATISTICS are printed to '
            'the Rocksdb LOG file'
        )
    )
    # ODS arguments (all optional; main() only uses them when --ods_client
    # is given)
    parser.add_argument(
        '--ods_client', type=str, help='the ODS client binary'
    )
    parser.add_argument(
        '--ods_entity', type=str,
        help='the servers for which the ODS stats need to be fetched'
    )
    parser.add_argument(
        '--ods_key_prefix', type=str,
        help=(
            'the prefix that needs to be attached to the keys of time '
            'series to be fetched from ODS'
        )
    )
    parser.add_argument(
        '--ods_tstart', type=int,
        help='start time of timeseries to be fetched from ODS'
    )
    parser.add_argument(
        '--ods_tend', type=int,
        help='end time of timeseries to be fetched from ODS'
    )
    args = parser.parse_args()
    main(args)
diff --git a/src/rocksdb/tools/advisor/advisor/rules.ini b/src/rocksdb/tools/advisor/advisor/rules.ini
new file mode 100644
index 000000000..ec7a07e60
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rules.ini
@@ -0,0 +1,214 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+#
+# FORMAT: very similar to the Rocksdb ini file in terms of syntax
+# (refer to rocksdb/examples/rocksdb_option_file_example.ini)
+#
+# The Rules INI file is made up of multiple sections and each section is made
+# up of multiple key-value pairs. The recognized section types are:
+# Rule, Suggestion, Condition. Each section must have a name specified in ""
+# in the section header. This name acts as an identifier in that section
+# type's namespace. A section header looks like:
+# [<section_type> "<section_name_identifier>"]
+#
+# There should be at least one Rule section in the file with its corresponding
+# Condition and Suggestion sections. A Rule is triggered only when all of its
+# conditions are triggered. The order in which a Rule's conditions and
+# suggestions are specified has no significance.
+#
+# A Condition must be associated with a data source specified by the parameter
+# 'source' and this must be the first parameter specified for the Condition.
+# A condition can be associated with one or more Rules.
+#
+# A Suggestion is an advised change to a Rocksdb option to improve the
+# performance of the database in some way. Every suggestion can be a part of
+# one or more Rules.
+
+[Rule "stall-too-many-memtables"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=stall-too-many-memtables
+
+[Condition "stall-too-many-memtables"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Rule "stall-too-many-L0"]
+suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
+conditions=stall-too-many-L0
+
+[Condition "stall-too-many-L0"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
+conditions=stop-too-many-L0
+
+[Condition "stop-too-many-L0"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Rule "stall-too-many-compaction-bytes"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
+conditions=stall-too-many-compaction-bytes
+
+[Condition "stall-too-many-compaction-bytes"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+suggested_values=2
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "inc-max-subcompactions"]
+option=DBOptions.max_subcompactions
+action=increase
+
+[Suggestion "inc-max-bg-compactions"]
+option=DBOptions.max_background_compactions
+action=increase
+suggested_values=2
+
+[Suggestion "inc-write-buffer-size"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "dec-max-bytes-for-level-base"]
+option=CFOptions.max_bytes_for_level_base
+action=decrease
+
+[Suggestion "inc-l0-slowdown-writes-trigger"]
+option=CFOptions.level0_slowdown_writes_trigger
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
+
+[Suggestion "inc-hard-pending-compaction-bytes-limit"]
+option=CFOptions.hard_pending_compaction_bytes_limit
+action=increase
+
+[Suggestion "inc-soft-pending-compaction-bytes-limit"]
+option=CFOptions.soft_pending_compaction_bytes_limit
+action=increase
+
+[Rule "level0-level1-ratio"]
+conditions=level0-level1-ratio
+suggestions=inc-base-max-bytes
+
+[Condition "level0-level1-ratio"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=int(options[0])*int(options[1])-int(options[2])>=1 # should evaluate to a boolean, condition triggered if evaluates to true
+
+[Suggestion "inc-base-max-bytes"]
+option=CFOptions.max_bytes_for_level_base
+action=increase
+
+[Rules "tuning-iostat-burst"]
+conditions=large-db-get-p99
+suggestions=bytes-per-sync-non0:wal-bytes-per-sync-non0:set-rate-limiter
+#overlap_time_period=10m
+
+[Condition "write-burst"]
+source=TIME_SERIES
+keys=dyno.flash_write_bytes_per_sec
+behavior=bursty
+window_sec=300 # the smaller this window, the more sensitivity to changes in the time series, so the rate_threshold should be bigger; when it's 60, then same as diff(%)
+rate_threshold=20
+
+[Condition "large-p99-read-latency"]
+source=TIME_SERIES
+keys=[]rocksdb.read.block.get.micros.p99
+behavior=bursty
+window_sec=300
+rate_threshold=10
+
+[Condition "large-db-get-p99"]
+source=TIME_SERIES
+keys=[]rocksdb.db.get.micros.p50:[]rocksdb.db.get.micros.p99
+behavior=evaluate_expression
+evaluate=(keys[1]/keys[0])>5
+
+[Suggestion "bytes-per-sync-non0"]
+option=DBOptions.bytes_per_sync
+action=set
+suggested_values=1048576
+
+[Suggestion "wal-bytes-per-sync-non0"]
+option=DBOptions.wal_bytes_per_sync
+action=set
+suggested_values=1048576
+
+[Suggestion "set-rate-limiter"]
+option=rate_limiter_bytes_per_sec
+action=set
+suggested_values=1024000
+
+[Rule "bloom-filter-percent-useful"]
+conditions=bloom-filter-percent-useful
+suggestions=inc-bloom-bits-per-key
+
+[Condition "bloom-filter-percent-useful"]
+source=TIME_SERIES
+keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
+behavior=evaluate_expression
+evaluate=((keys[0]+keys[2])/(keys[0]+keys[1]))<0.9 # should evaluate to a boolean
+aggregation_op=latest
+
+[Rule "bloom-not-enabled"]
+conditions=bloom-not-enabled
+suggestions=inc-bloom-bits-per-key
+
+[Condition "bloom-not-enabled"]
+source=TIME_SERIES
+keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
+behavior=evaluate_expression
+evaluate=keys[0]+keys[1]+keys[2]==0
+aggregation_op=avg
+
+[Suggestion "inc-bloom-bits-per-key"]
+option=bloom_bits
+action=increase
+suggested_values=2
+
+[Rule "small-l0-files"]
+conditions=small-l0-files
+suggestions=dec-max-bytes-for-level-base:inc-write-buffer-size
+
+[Condition "small-l0-files"]
+source=OPTIONS
+options=CFOptions.max_bytes_for_level_base:CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size
+evaluate=int(options[0])>(10*int(options[1])*int(options[2]))
+
+[Rule "decompress-time-long"]
+conditions=decompress-time-long
+suggestions=dec-block-size:inc-block-cache-size:faster-compression-type
+
+[Condition "decompress-time-long"]
+source=TIME_SERIES
+keys=block_decompress_time:block_read_time:block_checksum_time
+behavior=evaluate_expression
+evaluate=(keys[0]/(keys[0]+keys[1]+keys[2]))>0.3
+
+[Suggestion "dec-block-size"]
+option=TableOptions.BlockBasedTable.block_size
+action=decrease
+
+[Suggestion "inc-block-cache-size"]
+option=cache_size
+action=increase
+suggested_values=16000000
+
+[Suggestion "faster-compression-type"]
+option=CFOptions.compression
+action=set
+suggested_values=kLZ4Compression
diff --git a/src/rocksdb/tools/advisor/test/__init__.py b/src/rocksdb/tools/advisor/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/__init__.py
diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-0 b/src/rocksdb/tools/advisor/test/input_files/LOG-0
new file mode 100644
index 000000000..3c9d51641
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/LOG-0
@@ -0,0 +1,30 @@
+2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0
+2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0
+2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8
+2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, "flush_reason": "Write Buffer Full"}
+2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started
+2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1.
+2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886
+2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK
+2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started
+2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1.
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK
+2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stalling writes because of estimated pending compaction bytes 14410584
+2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started
+2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done
+2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK
+2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1}
+2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK
+2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84}
+2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete
+2018/05/25-14:34:21.049000 7f82bd676200 [db/db_impl.cc:563] [col-fam-A] random log message for testing
+2018/05/25-14:34:21.049010 7f82bd676200 [db/db_impl.cc:234] [col-fam-B] log continuing on next line
+remaining part of the log
+2018/05/25-14:34:21.049020 7f82bd676200 [db/db_impl.cc:653] [col-fam-A] another random log message
+2018/05/25-14:34:21.049025 7f82bd676200 [db/db_impl.cc:331] [unknown] random log message no column family
diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-1 b/src/rocksdb/tools/advisor/test/input_files/LOG-1
new file mode 100644
index 000000000..b163f9a99
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/LOG-1
@@ -0,0 +1,25 @@
+2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0
+2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0
+2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8
+2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, "flush_reason": "Write Buffer Full"}
+2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started
+2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1.
+2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886
+2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK
+2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started
+2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1.
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK
+2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stopping writes because of estimated pending compaction bytes 14410584
+2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started
+2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done
+2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK
+2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1}
+2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK
+2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84}
+2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete
diff --git a/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005 b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005
new file mode 100644
index 000000000..009edb04d
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005
@@ -0,0 +1,49 @@
+# This is a RocksDB option file.
+#
+# For detailed file format spec, please refer to the example file
+# in examples/rocksdb_option_file_example.ini
+#
+
+[Version]
+ rocksdb_version=5.14.0
+ options_file_version=1.1
+
+[DBOptions]
+ manual_wal_flush=false
+ allow_ingest_behind=false
+ db_write_buffer_size=0
+ db_log_dir=
+ random_access_max_buffer_size=1048576
+
+[CFOptions "default"]
+ ttl=0
+ max_bytes_for_level_base=268435456
+ max_bytes_for_level_multiplier=10.000000
+ level0_file_num_compaction_trigger=4
+ level0_stop_writes_trigger=36
+ write_buffer_size=4194000
+ min_write_buffer_number_to_merge=1
+ num_levels=7
+ compaction_filter_factory=nullptr
+ compaction_style=kCompactionStyleLevel
+
+[TableOptions/BlockBasedTable "default"]
+ block_align=false
+ index_type=kBinarySearch
+
+[CFOptions "col_fam_A"]
+ttl=0
+max_bytes_for_level_base=268435456
+max_bytes_for_level_multiplier=10.000000
+level0_file_num_compaction_trigger=5
+level0_stop_writes_trigger=36
+write_buffer_size=1024000
+min_write_buffer_number_to_merge=1
+num_levels=5
+compaction_filter_factory=nullptr
+compaction_style=kCompactionStyleLevel
+
+[TableOptions/BlockBasedTable "col_fam_A"]
+block_align=true
+block_restart_interval=16
+index_type=kBinarySearch
diff --git a/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts
new file mode 100644
index 000000000..e8ade9e3e
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts
@@ -0,0 +1,3 @@
+rocksdb.number.block.decompressed.count: 1530896335 88.0, 1530896361 788338.0, 1530896387 1539256.0, 1530896414 2255696.0, 1530896440 3009325.0, 1530896466 3767183.0, 1530896492 4529775.0, 1530896518 5297809.0, 1530896545 6033802.0, 1530896570 6794129.0
+rocksdb.db.get.micros.p50: 1530896335 295.5, 1530896361 16.561841, 1530896387 16.20677, 1530896414 16.31508, 1530896440 16.346602, 1530896466 16.284669, 1530896492 16.16005, 1530896518 16.069096, 1530896545 16.028746, 1530896570 15.9638
+rocksdb.manifest.file.sync.micros.p99: 1530896335 649.0, 1530896361 835.0, 1530896387 1435.0, 1530896414 9938.0, 1530896440 9938.0, 1530896466 9938.0, 1530896492 9938.0, 1530896518 1882.0, 1530896545 1837.0, 1530896570 1792.0
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini
new file mode 100644
index 000000000..23be55dde
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini
@@ -0,0 +1,56 @@
+[Rule "missing-suggestions"]
+suggestions=
+conditions=missing-source
+
+[Condition "normal-rule"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Rule "missing-conditions"]
+conditions=
+suggestions=missing-description
+
+[Condition "missing-options"]
+source=OPTIONS
+options=
+evaluate=int(options[0])*int(options[1])-int(options[2])<(-251659456) # should evaluate to a boolean
+
+[Rule "missing-expression"]
+conditions=missing-expression
+suggestions=missing-description
+
+[Condition "missing-expression"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=
+
+[Suggestion "missing-description"]
+description=
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:missing-action:inc-l0-stop-writes-trigger
+conditions=missing-regex
+
+[Condition "missing-regex"]
+source=LOG
+regex=
+
+[Suggestion "missing-option"]
+option=
+action=increase
+
+[Suggestion "normal-suggestion"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini
new file mode 100644
index 000000000..bce21dba9
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=missing-source
+
+[Condition "missing-source"]
+source=
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini
new file mode 100644
index 000000000..73c06e469
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=missing-action:inc-write-buffer
+conditions=missing-source
+
+[Condition "normal-condition"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "missing-action"]
+option=DBOptions.max_background_flushes
+action=
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini
new file mode 100644
index 000000000..4d4aa3c70
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=inc-bg-flush
+conditions=missing-source
+
+[Condition "normal-condition"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion] # missing section name
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/test_rules.ini b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini
new file mode 100644
index 000000000..97b9374fc
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini
@@ -0,0 +1,47 @@
+[Rule "single-condition-false"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=log-4-false
+
+[Rule "multiple-conds-true"]
+suggestions=inc-write-buffer
+conditions=log-1-true:log-2-true:log-3-true
+
+[Rule "multiple-conds-one-false"]
+suggestions=inc-bg-flush
+conditions=log-1-true:log-4-false:log-3-true
+
+[Rule "multiple-conds-all-false"]
+suggestions=l0-l1-ratio-health-check
+conditions=log-4-false:options-1-false
+
+[Condition "log-1-true"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Condition "log-2-true"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Condition "log-3-true"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Condition "log-4-false"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Condition "options-1-false"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:DBOptions.random_access_max_buffer_size
+evaluate=int(options[0])*int(options[1])-int(options[2])<0 # should evaluate to a boolean
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "l0-l1-ratio-health-check"]
+description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < 5) is satisfied'
diff --git a/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini
new file mode 100644
index 000000000..83b96da2b
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini
@@ -0,0 +1,83 @@
+[Rule "stall-too-many-memtables"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=stall-too-many-memtables
+
+[Condition "stall-too-many-memtables"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Rule "stall-too-many-L0"]
+suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
+conditions=stall-too-many-L0
+
+[Condition "stall-too-many-L0"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
+conditions=stop-too-many-L0
+
+[Condition "stop-too-many-L0"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Rule "stall-too-many-compaction-bytes"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
+conditions=stall-too-many-compaction-bytes
+
+[Condition "stall-too-many-compaction-bytes"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "inc-max-subcompactions"]
+option=DBOptions.max_subcompactions
+action=increase
+
+[Suggestion "inc-max-bg-compactions"]
+option=DBOptions.max_background_compactions
+action=increase
+
+[Suggestion "inc-write-buffer-size"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "dec-max-bytes-for-level-base"]
+option=CFOptions.max_bytes_for_level_base
+action=decrease
+
+[Suggestion "inc-l0-slowdown-writes-trigger"]
+option=CFOptions.level0_slowdown_writes_trigger
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
+
+[Suggestion "inc-hard-pending-compaction-bytes-limit"]
+option=CFOptions.hard_pending_compaction_bytes_limit
+action=increase
+
+[Suggestion "inc-soft-pending-compaction-bytes-limit"]
+option=CFOptions.soft_pending_compaction_bytes_limit
+action=increase
+
+[Rule "level0-level1-ratio"]
+conditions=level0-level1-ratio
+suggestions=l0-l1-ratio-health-check
+
+[Condition "level0-level1-ratio"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=int(options[0])*int(options[1])-int(options[2])>=-268173312 # should evaluate to a boolean, condition triggered if evaluates to true
+
+[Suggestion "l0-l1-ratio-health-check"]
+description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < -268173312) is satisfied'
diff --git a/src/rocksdb/tools/advisor/test/test_db_bench_runner.py b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py
new file mode 100644
index 000000000..1c4f77d50
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_bench_runner import DBBenchRunner
+from advisor.db_log_parser import NO_COL_FAMILY, DataSource
+from advisor.db_options_parser import DatabaseOptions
+import os
+import unittest
+
+
+class TestDBBenchRunnerMethods(unittest.TestCase):
+ def setUp(self):
+ self.pos_args = [
+ './../../db_bench',
+ 'overwrite',
+ 'use_existing_db=true',
+ 'duration=10'
+ ]
+ self.bench_runner = DBBenchRunner(self.pos_args)
+ this_path = os.path.abspath(os.path.dirname(__file__))
+ options_path = os.path.join(this_path, 'input_files/OPTIONS-000005')
+ self.db_options = DatabaseOptions(options_path)
+
+ def test_setup(self):
+ self.assertEqual(self.bench_runner.db_bench_binary, self.pos_args[0])
+ self.assertEqual(self.bench_runner.benchmark, self.pos_args[1])
+ self.assertSetEqual(
+ set(self.bench_runner.db_bench_args), set(self.pos_args[2:])
+ )
+
+ def test_get_info_log_file_name(self):
+ log_file_name = DBBenchRunner.get_info_log_file_name(
+ None, 'random_path'
+ )
+ self.assertEqual(log_file_name, 'LOG')
+
+ log_file_name = DBBenchRunner.get_info_log_file_name(
+ '/dev/shm/', '/tmp/rocksdbtest-155919/dbbench/'
+ )
+ self.assertEqual(log_file_name, 'tmp_rocksdbtest-155919_dbbench_LOG')
+
+ def test_get_opt_args_str(self):
+ misc_opt_dict = {'bloom_bits': 2, 'empty_opt': None, 'rate_limiter': 3}
+ optional_args_str = DBBenchRunner.get_opt_args_str(misc_opt_dict)
+ self.assertEqual(optional_args_str, ' --bloom_bits=2 --rate_limiter=3')
+
+ def test_get_log_options(self):
+ db_path = '/tmp/rocksdb-155919/dbbench'
+ # when db_log_dir is present in the db_options
+ update_dict = {
+ 'DBOptions.db_log_dir': {NO_COL_FAMILY: '/dev/shm'},
+ 'DBOptions.stats_dump_period_sec': {NO_COL_FAMILY: '20'}
+ }
+ self.db_options.update_options(update_dict)
+ log_file_prefix, stats_freq = self.bench_runner.get_log_options(
+ self.db_options, db_path
+ )
+ self.assertEqual(
+ log_file_prefix, '/dev/shm/tmp_rocksdb-155919_dbbench_LOG'
+ )
+ self.assertEqual(stats_freq, 20)
+
+ update_dict = {
+ 'DBOptions.db_log_dir': {NO_COL_FAMILY: None},
+ 'DBOptions.stats_dump_period_sec': {NO_COL_FAMILY: '30'}
+ }
+ self.db_options.update_options(update_dict)
+ log_file_prefix, stats_freq = self.bench_runner.get_log_options(
+ self.db_options, db_path
+ )
+ self.assertEqual(log_file_prefix, '/tmp/rocksdb-155919/dbbench/LOG')
+ self.assertEqual(stats_freq, 30)
+
+ def test_build_experiment_command(self):
+ # add some misc_options to db_options
+ update_dict = {
+ 'bloom_bits': {NO_COL_FAMILY: 2},
+ 'rate_limiter_bytes_per_sec': {NO_COL_FAMILY: 128000000}
+ }
+ self.db_options.update_options(update_dict)
+ db_path = '/dev/shm'
+ experiment_command = self.bench_runner._build_experiment_command(
+ self.db_options, db_path
+ )
+ opt_args_str = DBBenchRunner.get_opt_args_str(
+ self.db_options.get_misc_options()
+ )
+ opt_args_str += (
+ ' --options_file=' +
+ self.db_options.generate_options_config('12345')
+ )
+ for arg in self.pos_args[2:]:
+ opt_args_str += (' --' + arg)
+ expected_command = (
+ self.pos_args[0] + ' --benchmarks=' + self.pos_args[1] +
+ ' --statistics --perf_level=3 --db=' + db_path + opt_args_str
+ )
+ self.assertEqual(experiment_command, expected_command)
+
+
+class TestDBBenchRunner(unittest.TestCase):
+ def setUp(self):
+ # Note: the db_bench binary should be present in the rocksdb/ directory
+ self.pos_args = [
+ './../../db_bench',
+ 'overwrite',
+ 'use_existing_db=true',
+ 'duration=20'
+ ]
+ self.bench_runner = DBBenchRunner(self.pos_args)
+ this_path = os.path.abspath(os.path.dirname(__file__))
+ options_path = os.path.join(this_path, 'input_files/OPTIONS-000005')
+ self.db_options = DatabaseOptions(options_path)
+
+ def test_experiment_output(self):
+ update_dict = {'bloom_bits': {NO_COL_FAMILY: 2}}
+ self.db_options.update_options(update_dict)
+ db_path = '/dev/shm'
+ data_sources, throughput = self.bench_runner.run_experiment(
+ self.db_options, db_path
+ )
+ self.assertEqual(
+ data_sources[DataSource.Type.DB_OPTIONS][0].type,
+ DataSource.Type.DB_OPTIONS
+ )
+ self.assertEqual(
+ data_sources[DataSource.Type.LOG][0].type,
+ DataSource.Type.LOG
+ )
+ self.assertEqual(len(data_sources[DataSource.Type.TIME_SERIES]), 2)
+ self.assertEqual(
+ data_sources[DataSource.Type.TIME_SERIES][0].type,
+ DataSource.Type.TIME_SERIES
+ )
+ self.assertEqual(
+ data_sources[DataSource.Type.TIME_SERIES][1].type,
+ DataSource.Type.TIME_SERIES
+ )
+ self.assertEqual(
+ data_sources[DataSource.Type.TIME_SERIES][1].stats_freq_sec, 0
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/rocksdb/tools/advisor/test/test_db_log_parser.py b/src/rocksdb/tools/advisor/test/test_db_log_parser.py
new file mode 100644
index 000000000..b70430433
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_log_parser.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_log_parser import DatabaseLogs, Log, NO_COL_FAMILY
+from advisor.rule_parser import Condition, LogCondition
+import os
+import unittest
+
+
+class TestLog(unittest.TestCase):
+ def setUp(self):
+ self.column_families = ['default', 'col_fam_A']
+
+ def test_get_column_family(self):
+ test_log = (
+ "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " +
+ "[col_fam_A] [JOB 44] Level-0 flush table #84: 1890780 bytes OK"
+ )
+ db_log = Log(test_log, self.column_families)
+ self.assertEqual('col_fam_A', db_log.get_column_family())
+
+ test_log = (
+ "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " +
+ "[JOB 44] Level-0 flush table #84: 1890780 bytes OK"
+ )
+ db_log = Log(test_log, self.column_families)
+ db_log.append_message('[default] some remaining part of log')
+ self.assertEqual(NO_COL_FAMILY, db_log.get_column_family())
+
+ def test_get_methods(self):
+ hr_time = "2018/05/25-14:30:25.491635"
+ context = "7f82ba72e700"
+ message = (
+ "[db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table " +
+ "#23: started"
+ )
+ test_log = hr_time + " " + context + " " + message
+ db_log = Log(test_log, self.column_families)
+ self.assertEqual(db_log.get_message(), message)
+ remaining_message = "[col_fam_A] some more logs"
+ db_log.append_message(remaining_message)
+ self.assertEqual(
+ db_log.get_human_readable_time(), "2018/05/25-14:30:25.491635"
+ )
+ self.assertEqual(db_log.get_context(), "7f82ba72e700")
+ self.assertEqual(db_log.get_timestamp(), 1527258625)
+ self.assertEqual(
+ db_log.get_message(), str(message + '\n' + remaining_message)
+ )
+
+ def test_is_new_log(self):
+ new_log = "2018/05/25-14:34:21.047233 context random new log"
+ remaining_log = "2018/05/25 not really a new log"
+ self.assertTrue(Log.is_new_log(new_log))
+ self.assertFalse(Log.is_new_log(remaining_log))
+
+
+class TestDatabaseLogs(unittest.TestCase):
+ def test_check_and_trigger_conditions(self):
+ this_path = os.path.abspath(os.path.dirname(__file__))
+ logs_path_prefix = os.path.join(this_path, 'input_files/LOG-0')
+ column_families = ['default', 'col-fam-A', 'col-fam-B']
+ db_logs = DatabaseLogs(logs_path_prefix, column_families)
+ # matches, has 2 col_fams
+ condition1 = LogCondition.create(Condition('cond-A'))
+ condition1.set_parameter('regex', 'random log message')
+ # matches, multiple lines message
+ condition2 = LogCondition.create(Condition('cond-B'))
+ condition2.set_parameter('regex', 'continuing on next line')
+ # does not match
+ condition3 = LogCondition.create(Condition('cond-C'))
+ condition3.set_parameter('regex', 'this should match no log')
+ db_logs.check_and_trigger_conditions(
+ [condition1, condition2, condition3]
+ )
+ cond1_trigger = condition1.get_trigger()
+ self.assertEqual(2, len(cond1_trigger.keys()))
+ self.assertSetEqual(
+ {'col-fam-A', NO_COL_FAMILY}, set(cond1_trigger.keys())
+ )
+ self.assertEqual(2, len(cond1_trigger['col-fam-A']))
+ messages = [
+ "[db/db_impl.cc:563] [col-fam-A] random log message for testing",
+ "[db/db_impl.cc:653] [col-fam-A] another random log message"
+ ]
+ self.assertIn(cond1_trigger['col-fam-A'][0].get_message(), messages)
+ self.assertIn(cond1_trigger['col-fam-A'][1].get_message(), messages)
+ self.assertEqual(1, len(cond1_trigger[NO_COL_FAMILY]))
+ self.assertEqual(
+ cond1_trigger[NO_COL_FAMILY][0].get_message(),
+ "[db/db_impl.cc:331] [unknown] random log message no column family"
+ )
+ cond2_trigger = condition2.get_trigger()
+ self.assertEqual(['col-fam-B'], list(cond2_trigger.keys()))
+ self.assertEqual(1, len(cond2_trigger['col-fam-B']))
+ self.assertEqual(
+ cond2_trigger['col-fam-B'][0].get_message(),
+ "[db/db_impl.cc:234] [col-fam-B] log continuing on next line\n" +
+ "remaining part of the log"
+ )
+ self.assertIsNone(condition3.get_trigger())
diff --git a/src/rocksdb/tools/advisor/test/test_db_options_parser.py b/src/rocksdb/tools/advisor/test/test_db_options_parser.py
new file mode 100644
index 000000000..d53a9bdb5
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_options_parser.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import Condition, OptionCondition
+import os
+import unittest
+
+
+class TestDatabaseOptions(unittest.TestCase):
+ def setUp(self):
+ self.this_path = os.path.abspath(os.path.dirname(__file__))
+ self.og_options = os.path.join(
+ self.this_path, 'input_files/OPTIONS-000005'
+ )
+ misc_options = [
+ 'bloom_bits = 4', 'rate_limiter_bytes_per_sec = 1024000'
+ ]
+ # create the options object
+ self.db_options = DatabaseOptions(self.og_options, misc_options)
+ # perform clean-up before running tests
+ self.generated_options = os.path.join(
+ self.this_path, '../temp/OPTIONS_testing.tmp'
+ )
+ if os.path.isfile(self.generated_options):
+ os.remove(self.generated_options)
+
+ def test_get_options_diff(self):
+ old_opt = {
+ 'DBOptions.stats_dump_freq_sec': {NO_COL_FAMILY: '20'},
+ 'CFOptions.write_buffer_size': {
+ 'default': '1024000',
+ 'col_fam_A': '128000',
+ 'col_fam_B': '128000000'
+ },
+ 'DBOptions.use_fsync': {NO_COL_FAMILY: 'true'},
+ 'DBOptions.max_log_file_size': {NO_COL_FAMILY: '128000000'}
+ }
+ new_opt = {
+ 'bloom_bits': {NO_COL_FAMILY: '4'},
+ 'CFOptions.write_buffer_size': {
+ 'default': '128000000',
+ 'col_fam_A': '128000',
+ 'col_fam_C': '128000000'
+ },
+ 'DBOptions.use_fsync': {NO_COL_FAMILY: 'true'},
+ 'DBOptions.max_log_file_size': {NO_COL_FAMILY: '0'}
+ }
+ diff = DatabaseOptions.get_options_diff(old_opt, new_opt)
+
+ expected_diff = {
+ 'DBOptions.stats_dump_freq_sec': {NO_COL_FAMILY: ('20', None)},
+ 'bloom_bits': {NO_COL_FAMILY: (None, '4')},
+ 'CFOptions.write_buffer_size': {
+ 'default': ('1024000', '128000000'),
+ 'col_fam_B': ('128000000', None),
+ 'col_fam_C': (None, '128000000')
+ },
+ 'DBOptions.max_log_file_size': {NO_COL_FAMILY: ('128000000', '0')}
+ }
+ self.assertDictEqual(diff, expected_diff)
+
+ def test_is_misc_option(self):
+ self.assertTrue(DatabaseOptions.is_misc_option('bloom_bits'))
+ self.assertFalse(
+ DatabaseOptions.is_misc_option('DBOptions.stats_dump_freq_sec')
+ )
+
+ def test_set_up(self):
+ options = self.db_options.get_all_options()
+ self.assertEqual(22, len(options.keys()))
+ expected_misc_options = {
+ 'bloom_bits': '4', 'rate_limiter_bytes_per_sec': '1024000'
+ }
+ self.assertDictEqual(
+ expected_misc_options, self.db_options.get_misc_options()
+ )
+ self.assertListEqual(
+ ['default', 'col_fam_A'], self.db_options.get_column_families()
+ )
+
+ def test_get_options(self):
+ opt_to_get = [
+ 'DBOptions.manual_wal_flush', 'DBOptions.db_write_buffer_size',
+ 'bloom_bits', 'CFOptions.compaction_filter_factory',
+ 'CFOptions.num_levels', 'rate_limiter_bytes_per_sec',
+ 'TableOptions.BlockBasedTable.block_align', 'random_option'
+ ]
+ options = self.db_options.get_options(opt_to_get)
+ expected_options = {
+ 'DBOptions.manual_wal_flush': {NO_COL_FAMILY: 'false'},
+ 'DBOptions.db_write_buffer_size': {NO_COL_FAMILY: '0'},
+ 'bloom_bits': {NO_COL_FAMILY: '4'},
+ 'CFOptions.compaction_filter_factory': {
+ 'default': 'nullptr', 'col_fam_A': 'nullptr'
+ },
+ 'CFOptions.num_levels': {'default': '7', 'col_fam_A': '5'},
+ 'rate_limiter_bytes_per_sec': {NO_COL_FAMILY: '1024000'},
+ 'TableOptions.BlockBasedTable.block_align': {
+ 'default': 'false', 'col_fam_A': 'true'
+ }
+ }
+ self.assertDictEqual(expected_options, options)
+
+ def test_update_options(self):
+ # add new, update old, set old
+ # before updating
+ expected_old_opts = {
+ 'DBOptions.db_log_dir': {NO_COL_FAMILY: None},
+ 'DBOptions.manual_wal_flush': {NO_COL_FAMILY: 'false'},
+ 'bloom_bits': {NO_COL_FAMILY: '4'},
+ 'CFOptions.num_levels': {'default': '7', 'col_fam_A': '5'},
+ 'TableOptions.BlockBasedTable.block_restart_interval': {
+ 'col_fam_A': '16'
+ }
+ }
+ get_opts = list(expected_old_opts.keys())
+ options = self.db_options.get_options(get_opts)
+ self.assertEqual(expected_old_opts, options)
+ # after updating options
+ update_opts = {
+ 'DBOptions.db_log_dir': {NO_COL_FAMILY: '/dev/shm'},
+ 'DBOptions.manual_wal_flush': {NO_COL_FAMILY: 'true'},
+ 'bloom_bits': {NO_COL_FAMILY: '2'},
+ 'CFOptions.num_levels': {'col_fam_A': '7'},
+ 'TableOptions.BlockBasedTable.block_restart_interval': {
+ 'default': '32'
+ },
+ 'random_misc_option': {NO_COL_FAMILY: 'something'}
+ }
+ self.db_options.update_options(update_opts)
+ update_opts['CFOptions.num_levels']['default'] = '7'
+ update_opts['TableOptions.BlockBasedTable.block_restart_interval'] = {
+ 'default': '32', 'col_fam_A': '16'
+ }
+ get_opts.append('random_misc_option')
+ options = self.db_options.get_options(get_opts)
+ self.assertDictEqual(update_opts, options)
+ expected_misc_options = {
+ 'bloom_bits': '2',
+ 'rate_limiter_bytes_per_sec': '1024000',
+ 'random_misc_option': 'something'
+ }
+ self.assertDictEqual(
+ expected_misc_options, self.db_options.get_misc_options()
+ )
+
+ def test_generate_options_config(self):
+ # make sure file does not exist from before
+ self.assertFalse(os.path.isfile(self.generated_options))
+ self.db_options.generate_options_config('testing')
+ self.assertTrue(os.path.isfile(self.generated_options))
+
+ def test_check_and_trigger_conditions(self):
+ # options only from CFOptions
+ # setup the OptionCondition objects to check and trigger
+ update_dict = {
+ 'CFOptions.level0_file_num_compaction_trigger': {'col_fam_A': '4'},
+ 'CFOptions.max_bytes_for_level_base': {'col_fam_A': '10'}
+ }
+ self.db_options.update_options(update_dict)
+ cond1 = Condition('opt-cond-1')
+ cond1 = OptionCondition.create(cond1)
+ cond1.set_parameter(
+ 'options', [
+ 'CFOptions.level0_file_num_compaction_trigger',
+ 'TableOptions.BlockBasedTable.block_restart_interval',
+ 'CFOptions.max_bytes_for_level_base'
+ ]
+ )
+ cond1.set_parameter(
+ 'evaluate',
+ 'int(options[0])*int(options[1])-int(options[2])>=0'
+ )
+ # only DBOptions
+ cond2 = Condition('opt-cond-2')
+ cond2 = OptionCondition.create(cond2)
+ cond2.set_parameter(
+ 'options', [
+ 'DBOptions.db_write_buffer_size',
+ 'bloom_bits',
+ 'rate_limiter_bytes_per_sec'
+ ]
+ )
+ cond2.set_parameter(
+ 'evaluate',
+ '(int(options[2]) * int(options[1]) * int(options[0]))==0'
+ )
+ # mix of CFOptions and DBOptions
+ cond3 = Condition('opt-cond-3')
+ cond3 = OptionCondition.create(cond3)
+ cond3.set_parameter(
+ 'options', [
+ 'DBOptions.db_write_buffer_size', # 0
+ 'CFOptions.num_levels', # 5, 7
+ 'bloom_bits' # 4
+ ]
+ )
+ cond3.set_parameter(
+ 'evaluate', 'int(options[2])*int(options[0])+int(options[1])>6'
+ )
+ self.db_options.check_and_trigger_conditions([cond1, cond2, cond3])
+
+ cond1_trigger = {'col_fam_A': ['4', '16', '10']}
+ self.assertDictEqual(cond1_trigger, cond1.get_trigger())
+ cond2_trigger = {NO_COL_FAMILY: ['0', '4', '1024000']}
+ self.assertDictEqual(cond2_trigger, cond2.get_trigger())
+ cond3_trigger = {'default': ['0', '7', '4']}
+ self.assertDictEqual(cond3_trigger, cond3.get_trigger())
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py
new file mode 100644
index 000000000..afbbe8339
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+from advisor.db_stats_fetcher import LogStatsParser, DatabasePerfContext
+from advisor.db_timeseries_parser import NO_ENTITY
+from advisor.rule_parser import Condition, TimeSeriesCondition
+import os
+import time
+import unittest
+from unittest.mock import MagicMock
+
+
+class TestLogStatsParser(unittest.TestCase):
+ def setUp(self):
+ this_path = os.path.abspath(os.path.dirname(__file__))
+ stats_file = os.path.join(
+ this_path, 'input_files/log_stats_parser_keys_ts'
+ )
+ # populate the keys_ts dictionary of LogStatsParser
+ self.stats_dict = {NO_ENTITY: {}}
+ with open(stats_file, 'r') as fp:
+ for line in fp:
+ stat_name = line.split(':')[0].strip()
+ self.stats_dict[NO_ENTITY][stat_name] = {}
+ token_list = line.split(':')[1].strip().split(',')
+ for token in token_list:
+ timestamp = int(token.split()[0])
+ value = float(token.split()[1])
+ self.stats_dict[NO_ENTITY][stat_name][timestamp] = value
+ self.log_stats_parser = LogStatsParser('dummy_log_file', 20)
+ self.log_stats_parser.keys_ts = self.stats_dict
+
+ def test_check_and_trigger_conditions_bursty(self):
+ # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+ self.log_stats_parser.fetch_timeseries = MagicMock()
+ # condition: bursty
+ cond1 = Condition('cond-1')
+ cond1 = TimeSeriesCondition.create(cond1)
+ cond1.set_parameter('keys', 'rocksdb.db.get.micros.p50')
+ cond1.set_parameter('behavior', 'bursty')
+ cond1.set_parameter('window_sec', 40)
+ cond1.set_parameter('rate_threshold', 0)
+ self.log_stats_parser.check_and_trigger_conditions([cond1])
+ expected_cond_trigger = {
+ NO_ENTITY: {1530896440: 0.9767546362322214}
+ }
+ self.assertDictEqual(expected_cond_trigger, cond1.get_trigger())
+ # ensure that fetch_timeseries() was called once
+ self.log_stats_parser.fetch_timeseries.assert_called_once()
+
+ def test_check_and_trigger_conditions_eval_agg(self):
+ # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+ self.log_stats_parser.fetch_timeseries = MagicMock()
+ # condition: evaluate_expression
+ cond1 = Condition('cond-1')
+ cond1 = TimeSeriesCondition.create(cond1)
+ cond1.set_parameter('keys', 'rocksdb.db.get.micros.p50')
+ cond1.set_parameter('behavior', 'evaluate_expression')
+ keys = [
+ 'rocksdb.manifest.file.sync.micros.p99',
+ 'rocksdb.db.get.micros.p50'
+ ]
+ cond1.set_parameter('keys', keys)
+ cond1.set_parameter('aggregation_op', 'latest')
+ # condition evaluates to FALSE
+ cond1.set_parameter('evaluate', 'keys[0]-(keys[1]*100)>200')
+ self.log_stats_parser.check_and_trigger_conditions([cond1])
+ expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]}
+ self.assertIsNone(cond1.get_trigger())
+ # condition evaluates to TRUE
+ cond1.set_parameter('evaluate', 'keys[0]-(keys[1]*100)<200')
+ self.log_stats_parser.check_and_trigger_conditions([cond1])
+ expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]}
+ self.assertDictEqual(expected_cond_trigger, cond1.get_trigger())
+ # ensure that fetch_timeseries() was called
+ self.log_stats_parser.fetch_timeseries.assert_called()
+
+ def test_check_and_trigger_conditions_eval(self):
+ # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+ self.log_stats_parser.fetch_timeseries = MagicMock()
+ # condition: evaluate_expression
+ cond1 = Condition('cond-1')
+ cond1 = TimeSeriesCondition.create(cond1)
+ cond1.set_parameter('keys', 'rocksdb.db.get.micros.p50')
+ cond1.set_parameter('behavior', 'evaluate_expression')
+ keys = [
+ 'rocksdb.manifest.file.sync.micros.p99',
+ 'rocksdb.db.get.micros.p50'
+ ]
+ cond1.set_parameter('keys', keys)
+ cond1.set_parameter('evaluate', 'keys[0]-(keys[1]*100)>500')
+ self.log_stats_parser.check_and_trigger_conditions([cond1])
+ expected_trigger = {NO_ENTITY: {
+ 1530896414: [9938.0, 16.31508],
+ 1530896440: [9938.0, 16.346602],
+ 1530896466: [9938.0, 16.284669],
+ 1530896492: [9938.0, 16.16005]
+ }}
+ self.assertDictEqual(expected_trigger, cond1.get_trigger())
+ self.log_stats_parser.fetch_timeseries.assert_called_once()
+
+
+class TestDatabasePerfContext(unittest.TestCase):
+ def test_unaccumulate_metrics(self):
+ perf_dict = {
+ "user_key_comparison_count": 675903942,
+ "block_cache_hit_count": 830086,
+ }
+ timestamp = int(time.time())
+ perf_ts = {}
+ for key in perf_dict:
+ perf_ts[key] = {}
+ start_val = perf_dict[key]
+ for ix in range(5):
+ perf_ts[key][timestamp+(ix*10)] = start_val + (2 * ix * ix)
+ db_perf_context = DatabasePerfContext(perf_ts, 10, True)
+ timestamps = [timestamp+(ix*10) for ix in range(1, 5, 1)]
+ values = [val for val in range(2, 15, 4)]
+ inner_dict = {timestamps[ix]: values[ix] for ix in range(4)}
+ expected_keys_ts = {NO_ENTITY: {
+ 'user_key_comparison_count': inner_dict,
+ 'block_cache_hit_count': inner_dict
+ }}
+ self.assertDictEqual(expected_keys_ts, db_perf_context.keys_ts)
diff --git a/src/rocksdb/tools/advisor/test/test_rule_parser.py b/src/rocksdb/tools/advisor/test/test_rule_parser.py
new file mode 100644
index 000000000..9f1d0bf5c
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_rule_parser.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+import os
+import unittest
+from advisor.rule_parser import RulesSpec
+from advisor.db_log_parser import DatabaseLogs, DataSource
+from advisor.db_options_parser import DatabaseOptions
+
+# Expected suggestions for each rule, keyed by rule name.
+# TestAllRulesTriggered compares these lists with the suggestions returned
+# by every triggered rule.
+RuleToSuggestions = {
+ "stall-too-many-memtables": [
+ 'inc-bg-flush',
+ 'inc-write-buffer'
+ ],
+ "stall-too-many-L0": [
+ 'inc-max-subcompactions',
+ 'inc-max-bg-compactions',
+ 'inc-write-buffer-size',
+ 'dec-max-bytes-for-level-base',
+ 'inc-l0-slowdown-writes-trigger'
+ ],
+ "stop-too-many-L0": [
+ 'inc-max-bg-compactions',
+ 'inc-write-buffer-size',
+ 'inc-l0-stop-writes-trigger'
+ ],
+ "stall-too-many-compaction-bytes": [
+ 'inc-max-bg-compactions',
+ 'inc-write-buffer-size',
+ 'inc-hard-pending-compaction-bytes-limit',
+ 'inc-soft-pending-compaction-bytes-limit'
+ ],
+ "level0-level1-ratio": [
+ 'l0-l1-ratio-health-check'
+ ]
+}
+
+
+class TestAllRulesTriggered(unittest.TestCase):
+    """Check that every rule of triggered_rules.ini gets triggered."""
+
+    def setUp(self):
+        test_dir = os.path.abspath(os.path.dirname(__file__))
+        # load the Rules
+        self.db_rules = RulesSpec(
+            os.path.join(test_dir, 'input_files/triggered_rules.ini')
+        )
+        self.db_rules.load_rules_from_spec()
+        self.db_rules.perform_section_checks()
+        # load the data sources: LOG and OPTIONS
+        options_parser = DatabaseOptions(
+            os.path.join(test_dir, 'input_files/OPTIONS-000005')
+        )
+        self.column_families = options_parser.get_column_families()
+        logs_parser = DatabaseLogs(
+            os.path.join(test_dir, 'input_files/LOG-0'), self.column_families
+        )
+        self.data_sources = {
+            DataSource.Type.DB_OPTIONS: [options_parser],
+            DataSource.Type.LOG: [logs_parser]
+        }
+
+    def test_triggered_conditions(self):
+        """Nothing is triggered initially; afterwards every rule is."""
+        conditions_dict = self.db_rules.get_conditions_dict()
+        rules_dict = self.db_rules.get_rules_dict()
+        all_rules = rules_dict.values()
+        # Make sure none of the conditions is triggered beforehand
+        for cond in conditions_dict.values():
+            self.assertFalse(cond.is_triggered(), repr(cond))
+        for rule in all_rules:
+            fired = rule.is_triggered(conditions_dict, self.column_families)
+            self.assertFalse(fired, repr(rule))
+
+        # Get the set of rules that have been triggered
+        triggered_rules = self.db_rules.get_triggered_rules(
+            self.data_sources, self.column_families
+        )
+
+        # Make sure each condition and rule is triggered; conditions whose
+        # data source is TIME_SERIES are not checked
+        for cond in conditions_dict.values():
+            if cond.get_data_source() is not DataSource.Type.TIME_SERIES:
+                self.assertTrue(cond.is_triggered(), repr(cond))
+
+        for rule in all_rules:
+            self.assertIn(rule, triggered_rules)
+            # Check the suggestions made by the triggered rules
+            for suggestion in rule.get_suggestions():
+                self.assertIn(suggestion, RuleToSuggestions[rule.name])
+
+        # ...and the other way round: no unknown rule, no missing suggestion
+        for rule in triggered_rules:
+            self.assertIn(rule, all_rules)
+            for suggestion in RuleToSuggestions[rule.name]:
+                self.assertIn(suggestion, rule.get_suggestions())
+
+
+class TestConditionsConjunctions(unittest.TestCase):
+    """Check which conditions and rules of test_rules.ini get triggered."""
+
+    def setUp(self):
+        test_dir = os.path.abspath(os.path.dirname(__file__))
+        # load the Rules
+        self.db_rules = RulesSpec(
+            os.path.join(test_dir, 'input_files/test_rules.ini')
+        )
+        self.db_rules.load_rules_from_spec()
+        self.db_rules.perform_section_checks()
+        # load the data sources: LOG and OPTIONS
+        options_parser = DatabaseOptions(
+            os.path.join(test_dir, 'input_files/OPTIONS-000005')
+        )
+        self.column_families = options_parser.get_column_families()
+        logs_parser = DatabaseLogs(
+            os.path.join(test_dir, 'input_files/LOG-1'), self.column_families
+        )
+        self.data_sources = {
+            DataSource.Type.DB_OPTIONS: [options_parser],
+            DataSource.Type.LOG: [logs_parser]
+        }
+
+    def test_condition_conjunctions(self):
+        """A rule is expected to trigger only in 'multiple-conds-true'."""
+        conditions_dict = self.db_rules.get_conditions_dict()
+        rules_dict = self.db_rules.get_rules_dict()
+        # Make sure none of the conditions is triggered beforehand
+        for cond in conditions_dict.values():
+            self.assertFalse(cond.is_triggered(), repr(cond))
+        for rule in rules_dict.values():
+            fired = rule.is_triggered(conditions_dict, self.column_families)
+            self.assertFalse(fired, repr(rule))
+
+        # Trigger the conditions as per the data sources.
+        self.db_rules.trigger_conditions(self.data_sources)
+
+        # Check for the conditions: names paired with the assertion to apply
+        condition_checks = (
+            (['log-1-true', 'log-2-true', 'log-3-true'], self.assertTrue),
+            (['log-4-false', 'options-1-false'], self.assertFalse),
+        )
+        for cond_names, check in condition_checks:
+            for cond_name in cond_names:
+                check(conditions_dict[cond_name].is_triggered(),
+                      repr(cond_name))
+
+        # Check for the rules, in the same manner
+        rule_checks = (
+            (['multiple-conds-true'], self.assertTrue),
+            ([
+                'single-condition-false',
+                'multiple-conds-one-false',
+                'multiple-conds-all-false'
+            ], self.assertFalse),
+        )
+        for rule_names, check in rule_checks:
+            for rule_name in rule_names:
+                rule = rules_dict[rule_name]
+                check(
+                    rule.is_triggered(conditions_dict, self.column_families),
+                    repr(rule)
+                )
+
+
+class TestSanityChecker(unittest.TestCase):
+    """perform_checks() must reject the faulty sections of rules_err1.ini."""
+
+    def setUp(self):
+        test_dir = os.path.abspath(os.path.dirname(__file__))
+        db_rules = RulesSpec(
+            os.path.join(test_dir, 'input_files/rules_err1.ini')
+        )
+        # no perform_section_checks() here: each test checks one section
+        db_rules.load_rules_from_spec()
+        self.rules_dict = db_rules.get_rules_dict()
+        self.conditions_dict = db_rules.get_conditions_dict()
+        self.suggestions_dict = db_rules.get_suggestions_dict()
+
+    def _assert_checks_fail(self, section, regex):
+        """Assert section.perform_checks() raises ValueError matching regex."""
+        with self.assertRaisesRegex(ValueError, regex):
+            section.perform_checks()
+
+    def test_rule_missing_suggestions(self):
+        self._assert_checks_fail(
+            self.rules_dict['missing-suggestions'],
+            '.*rule must have at least one suggestion.*'
+        )
+
+    def test_rule_missing_conditions(self):
+        self._assert_checks_fail(
+            self.rules_dict['missing-conditions'],
+            '.*rule must have at least one condition.*'
+        )
+
+    def test_condition_missing_regex(self):
+        self._assert_checks_fail(
+            self.conditions_dict['missing-regex'],
+            '.*provide regex for log condition.*'
+        )
+
+    def test_condition_missing_options(self):
+        self._assert_checks_fail(
+            self.conditions_dict['missing-options'],
+            '.*options missing in condition.*'
+        )
+
+    def test_condition_missing_expression(self):
+        self._assert_checks_fail(
+            self.conditions_dict['missing-expression'],
+            '.*expression missing in condition.*'
+        )
+
+    def test_suggestion_missing_option(self):
+        self._assert_checks_fail(
+            self.suggestions_dict['missing-option'],
+            '.*provide option or description.*'
+        )
+
+    def test_suggestion_missing_description(self):
+        self._assert_checks_fail(
+            self.suggestions_dict['missing-description'],
+            '.*provide option or description.*'
+        )
+
+
+class TestParsingErrors(unittest.TestCase):
+    """load_rules_from_spec() must fail on the malformed rules_err*.ini."""
+
+    def setUp(self):
+        self.this_path = os.path.abspath(os.path.dirname(__file__))
+
+    def _assert_load_fails(self, ini_name, error_type, regex):
+        """Assert that loading input_files/<ini_name> raises error_type."""
+        ini_path = os.path.join(self.this_path, 'input_files/' + ini_name)
+        db_rules = RulesSpec(ini_path)
+        with self.assertRaisesRegex(error_type, regex):
+            db_rules.load_rules_from_spec()
+
+    def test_condition_missing_source(self):
+        self._assert_load_fails(
+            'rules_err2.ini',
+            NotImplementedError,
+            '.*provide source for condition.*'
+        )
+
+    def test_suggestion_missing_action(self):
+        self._assert_load_fails(
+            'rules_err3.ini', ValueError, '.*provide action for option.*'
+        )
+
+    def test_section_no_name(self):
+        self._assert_load_fails(
+            'rules_err4.ini',
+            ValueError,
+            'Parsing error: needed section header:.*'
+        )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/rocksdb/tools/analyze_txn_stress_test.sh b/src/rocksdb/tools/analyze_txn_stress_test.sh
new file mode 100755
index 000000000..477b1fac5
--- /dev/null
+++ b/src/rocksdb/tools/analyze_txn_stress_test.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Usage:
+# 1. Enable ROCKS_LOG_DETAILS in util/logging.h
+# 2. Run ./transaction_test --gtest_filter="MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/*" --gtest_break_on_failure
+# 3. SET=1 # 2 or 3
+# 4. LOG=/dev/shm/transaction_testdb_8600601584148590297/LOG
+# 5. grep RandomTransactionVerify $LOG | cut -d' ' -f 12 | sort -n # to find verify snapshots
+# 6. vn=1345
+# 7. vn_1=1340
+# 8. . tools/analyze_txn_stress_test.sh
+# This script reads LOG, vn, vn_1 and SET from the calling shell and ends
+# with 'return', so it has to be sourced rather than executed.
+echo Input params:
+# The rocksdb LOG path
+echo $LOG
+# Snapshot at which we got RandomTransactionVerify failure
+echo $vn
+# The snapshot before that where RandomTransactionVerify passed
+echo $vn_1
+# The stress tests use 3 sets, one or more might have shown inconsistent results.
+SET=${SET-1} # 1 or 2 or 3
+echo Checking set number $SET
+
+# Find the txns that committed between the two snapshots, and gather their changes made by them in /tmp/changes.txt
+# 2019/02/28-15:25:51.655477 7fffec9ff700 [DEBUG] [ilities/transactions/write_prepared_txn_db.cc:416] Txn 68497 Committing with 68498
+grep Committing $LOG | awk '{if ($9 <= vn && $9 > vn_1) print $0}' vn=$vn vn_1=${vn_1} > /tmp/txn.txt
+# 2019/02/28-15:25:49.046464 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:216] Commit of 65541 OK (txn12936193128775589751-9089)
+for i in `cat /tmp/txn.txt | awk '{print $6}'`; do grep "Commit of $i " $LOG; done > /tmp/names.txt
+for n in `cat /tmp/names.txt | awk '{print $9}'`; do grep $n $LOG; done > /tmp/changes.txt
+echo "Sum of the changes:"
+cat /tmp/changes.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}'
+
+# Gather read values at each snapshot
+# 2019/02/28-15:25:51.655926 7fffebbff700 [DEBUG] [il/transaction_test_util.cc:347] VerifyRead at 67972 (67693): 000230 value: 15983
+grep "VerifyRead at ${vn_1} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/va.txt
+grep "VerifyRead at ${vn} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/vb.txt
+
+# For each key in the 2nd snapshot, find the value read by 1st, do the adds, and see if the results match.
+# Split on newlines only from here on, so that each loop variable below is a
+# whole line of the file even though the lines contain spaces.
+IFS=$'\n'
+for l in `cat /tmp/vb.txt`;
+do
+ grep $l /tmp/va.txt > /dev/null ;
+ if [[ $? -ne 0 ]]; then
+ #echo $l
+ k=`echo $l | awk '{print $1}'`;
+ v=`echo $l | awk '{print $3}'`;
+ # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867
+ # The expected value is the result (the part after '=') of the last change to the key.
+ exp=`grep "\<$k\>" /tmp/changes.txt | tail -1 | cut -d= -f2`;
+ if [[ $v -ne $exp ]]; then echo $l; fi
+ else
+ # The line is identical in both snapshots: print any change recorded for its key.
+ k=`echo $l | awk '{print $1}'`;
+ grep "\<$k\>" /tmp/changes.txt
+ fi;
+done
+
+# Check that all the keys read in the 1st snapshot are still visible in the 2nd
+for l in `cat /tmp/va.txt`;
+do
+ k=`echo $l | awk '{print $1}'`;
+ grep "\<$k\>" /tmp/vb.txt > /dev/null
+ if [[ $? -ne 0 ]]; then
+ echo missing key $k
+ fi
+done
+
+# The following found a bug in ValidateSnapshot. It checks if the adds on each key match up.
+grep Insert /tmp/changes.txt | cut -d' ' -f 10 | sort | uniq > /tmp/keys.txt
+for k in `cat /tmp/keys.txt`;
+do
+ grep "\<$k\>" /tmp/changes.txt > /tmp/adds.txt;
+ # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867
+ # START: value before the first add, END: value after the last add, ADDS: sum of all the added amounts.
+ START=`head -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d+ -f1`
+ END=`tail -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d= -f2`
+ ADDS=`cat /tmp/adds.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}'`
+ EXP=$((START+ADDS))
+ # If first + all the adds != last then there was an issue with ValidateSnapshot.
+ if [[ $END -ne $EXP ]]; then echo inconsistent txn: $k $START+$ADDS=$END; cat /tmp/adds.txt; return 1; fi
+done
diff --git a/src/rocksdb/tools/auto_sanity_test.sh b/src/rocksdb/tools/auto_sanity_test.sh
new file mode 100755
index 000000000..4670ef9bb
--- /dev/null
+++ b/src/rocksdb/tools/auto_sanity_test.sh
@@ -0,0 +1,93 @@
+# shellcheck disable=SC2148
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Scratch directory that holds one database per tested commit.
+TMP_DIR="${TMPDIR:-/tmp}/rocksdb-sanity-test"
+
+# Without two explicit commits, compare the latest commit with the 10th
+# latest one of the current branch.
+if [ "$#" -lt 2 ]; then
+  echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
+  echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
+  recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
+  commit_new=`echo "$recent_commits" | head -n1`
+  commit_old=`echo "$recent_commits" | tail -n1`
+  echo "the most recent commits are:"
+  echo "$recent_commits"
+else
+  commit_new=$1
+  commit_old=$2
+fi
+
+# Quoted so that a TMPDIR containing whitespace is handled as one path;
+# -p creates the directory only when it does not exist yet.
+mkdir -p "$TMP_DIR"
+dir_new="${TMP_DIR}/${commit_new}"
+dir_old="${TMP_DIR}/${commit_old}"
+
+# Rebuild db_sanity_test from a clean tree for the currently checked-out
+# commit. Build output is discarded; a failed build ends the script with
+# exit status 1.
+function makestuff() {
+  echo "make clean"
+  make clean > /dev/null
+  echo "make db_sanity_test -j32"
+  if ! make db_sanity_test -j32 > /dev/null; then
+    echo "[ERROR] Failed to perform 'make db_sanity_test'"
+    exit 1
+  fi
+}
+
+# Start from empty database directories. Paths and commits are quoted
+# throughout so that whitespace in TMPDIR cannot split them.
+rm -r -f "$dir_new"
+rm -r -f "$dir_old"
+
+echo "Running db sanity check with commits $commit_new and $commit_old."
+
+echo "============================================================="
+echo "Making build $commit_new"
+git checkout "$commit_new"
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_new"
+  exit 1
+fi
+makestuff
+mv db_sanity_test new_db_sanity_test
+echo "Creating db based on the new commit --- $commit_new"
+./new_db_sanity_test "$dir_new" create
+# Keep the new commit's test source and script: they are copied over the old
+# commit's versions below, so both binaries are built from the same test.
+cp ./tools/db_sanity_test.cc "$dir_new"
+cp ./tools/auto_sanity_test.sh "$dir_new"
+
+echo "============================================================="
+echo "Making build $commit_old"
+git checkout "$commit_old"
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_old"
+  exit 1
+fi
+cp -f "$dir_new/db_sanity_test.cc" ./tools/.
+cp -f "$dir_new/auto_sanity_test.sh" ./tools/.
+makestuff
+mv db_sanity_test old_db_sanity_test
+echo "Creating db based on the old commit --- $commit_old"
+./old_db_sanity_test "$dir_old" create
+
+echo "============================================================="
+echo "[Backward Compatibility Check]"
+echo "Verifying old db $dir_old using the new commit --- $commit_new"
+./new_db_sanity_test "$dir_old" verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Backward Compatibility Check fails:"
+  echo " Verification of $dir_old using commit $commit_new failed."
+  exit 2
+fi
+
+echo "============================================================="
+echo "[Forward Compatibility Check]"
+echo "Verifying new db $dir_new using the old commit --- $commit_old"
+./old_db_sanity_test "$dir_new" verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Forward Compatibility Check fails:"
+  # Same wording as the backward check above.
+  echo " Verification of $dir_new using commit $commit_old failed."
+  exit 2
+fi
+
+# Both checks passed: remove the binaries and the databases.
+rm old_db_sanity_test
+rm new_db_sanity_test
+rm -rf "$dir_new"
+rm -rf "$dir_old"
+
+echo "Auto sanity test passed!"
diff --git a/src/rocksdb/tools/benchmark.sh b/src/rocksdb/tools/benchmark.sh
new file mode 100755
index 000000000..f27926ed5
--- /dev/null
+++ b/src/rocksdb/tools/benchmark.sh
@@ -0,0 +1,525 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: db_bench binary exists in the current directory
+
+# Exactly one argument is expected; the job loop at the end of this script
+# splits it on commas into the jobs to run.
+if [ $# -ne 1 ]; then
+  echo -n "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/"
+  echo "readrandom/readwhilewriting/readwhilemerging/updaterandom/"
+  echo "mergerandom/randomtransaction/compact]"
+  exit 0
+fi
+
+# Make it easier to run only the compaction test. Getting valid data requires
+# a number of iterations and having an ability to run the test separately from
+# rest of the benchmarks helps.
+if [ "$COMPACTION_TEST" == "1" ] && [ "$1" != "universal_compaction" ]; then
+  echo "Skipping $1 because it's not a compaction test."
+  exit 0
+fi
+
+# size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+T=$((1024 * G))
+
+# The variables are quoted inside the tests so that empty values and values
+# containing whitespace are both handled as a single word.
+if [ -z "$DB_DIR" ]; then
+  echo "DB_DIR is not defined"
+  exit 0
+fi
+
+if [ -z "$WAL_DIR" ]; then
+  echo "WAL_DIR is not defined"
+  exit 0
+fi
+
+output_dir=${OUTPUT_DIR:-/tmp/}
+if [ ! -d "$output_dir" ]; then
+  mkdir -p "$output_dir"
+fi
+
+# all multithreaded tests run with sync=1 unless
+# $DB_BENCH_NO_SYNC is defined
+syncval="1"
+if [ -n "$DB_BENCH_NO_SYNC" ]; then
+  echo "Turning sync off for all multithreaded tests"
+  syncval="0";
+fi
+
+num_threads=${NUM_THREADS:-64}
+mb_written_per_sec=${MB_WRITE_PER_SEC:-0}
+# Only for tests that do range scans
+num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
+# 16 * G = 17179869184 bytes
+cache_size=${CACHE_SIZE:-$((16 * G))}
+compression_max_dict_bytes=${COMPRESSION_MAX_DICT_BYTES:-0}
+compression_type=${COMPRESSION_TYPE:-zstd}
+duration=${DURATION:-0}
+
+num_keys=${NUM_KEYS:-8000000000}
+key_size=${KEY_SIZE:-20}
+value_size=${VALUE_SIZE:-400}
+block_size=${BLOCK_SIZE:-8192}
+
+# db_bench options shared by every parameter set defined below.
+const_params="
+ --db=$DB_DIR \
+ --wal_dir=$WAL_DIR \
+ \
+ --num=$num_keys \
+ --num_levels=6 \
+ --key_size=$key_size \
+ --value_size=$value_size \
+ --block_size=$block_size \
+ --cache_size=$cache_size \
+ --cache_numshardbits=6 \
+ --compression_max_dict_bytes=$compression_max_dict_bytes \
+ --compression_ratio=0.5 \
+ --compression_type=$compression_type \
+ --level_compaction_dynamic_level_bytes=true \
+ --bytes_per_sync=$((8 * M)) \
+ --cache_index_and_filter_blocks=0 \
+ --pin_l0_filter_and_index_blocks_in_cache=1 \
+ --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \
+ \
+ --hard_rate_limit=3 \
+ --rate_limit_delay_max_milliseconds=1000000 \
+ --write_buffer_size=$((128 * M)) \
+ --target_file_size_base=$((128 * M)) \
+ --max_bytes_for_level_base=$((1 * G)) \
+ \
+ --verify_checksum=1 \
+ --delete_obsolete_files_period_micros=$((60 * M)) \
+ --max_bytes_for_level_multiplier=8 \
+ \
+ --statistics=0 \
+ --stats_per_interval=1 \
+ --stats_interval_seconds=60 \
+ --histogram=1 \
+ \
+ --memtablerep=skip_list \
+ --bloom_bits=10 \
+ --open_files=-1"
+
+# Level-0 triggers that are added to params_w only; the bulkload and
+# compaction parameter sets below set their own level0_* values.
+l0_config="
+ --level0_file_num_compaction_trigger=4 \
+ --level0_stop_writes_trigger=20"
+
+# A positive DURATION adds --duration to the shared options.
+if [ $duration -gt 0 ]; then
+ const_params="$const_params --duration=$duration"
+fi
+
+# Parameter set used by most of the run_* functions below.
+params_w="$const_params \
+ $l0_config \
+ --max_background_compactions=16 \
+ --max_write_buffer_number=8 \
+ --max_background_flushes=7"
+
+# Parameter set for the load phase of run_bulkload: all three level0
+# triggers are set to $((10 * M)).
+params_bulkload="$const_params \
+ --max_background_compactions=16 \
+ --max_write_buffer_number=8 \
+ --allow_concurrent_memtable_write=false \
+ --max_background_flushes=7 \
+ --level0_file_num_compaction_trigger=$((10 * M)) \
+ --level0_slowdown_writes_trigger=$((10 * M)) \
+ --level0_stop_writes_trigger=$((10 * M))"
+
+# Parameter set for run_fillseq.
+params_fillseq="$params_w \
+ --allow_concurrent_memtable_write=false"
+#
+# Tune values for level and universal compaction.
+# For universal compaction, these level0_* options mean the total number of
+# sorted runs in the LSM. In level-based compaction, they mean the number of
+# L0 files.
+#
+params_level_compact="$const_params \
+ --max_background_flushes=4 \
+ --max_write_buffer_number=4 \
+ --level0_file_num_compaction_trigger=4 \
+ --level0_slowdown_writes_trigger=16 \
+ --level0_stop_writes_trigger=20"
+
+params_univ_compact="$const_params \
+ --max_background_flushes=4 \
+ --max_write_buffer_number=4 \
+ --level0_file_num_compaction_trigger=8 \
+ --level0_slowdown_writes_trigger=16 \
+ --level0_stop_writes_trigger=20"
+
+# Extract the results of one db_bench run from its log and append them as a
+# single tab-separated line to $output_dir/report.txt.
+#
+# $1 - log file written by the db_bench run.
+# $2 - test name, written as the last column of the report line.
+# $3 - db_bench benchmark name; log lines starting with it are parsed for
+#      the ops/sec, MB/sec and usec/op columns.
+function summarize_result {
+ test_out=$1
+ test_name=$2
+ bench_name=$3
+
+ # Note that this function assumes that the benchmark executes long enough so
+ # that "Compaction Stats" is written to stdout at least once. If it won't
+ # happen then empty output from grep when searching for "Sum" will cause
+ # syntax errors.
+ uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
+ stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' )
+ stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' )
+ ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' )
+ mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' )
+ lo_wgb=$( grep "^ L0" $test_out | tail -1 | awk '{ print $9 }' )
+ sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $9 }' )
+ sum_size=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.1f", $3 / 1024.0 }' )
+ # bc computes the two derived columns: sum_wgb / lo_wgb and
+ # sum_wgb * 1024 / uptime.
+ wamp=$( echo "scale=1; $sum_wgb / $lo_wgb" | bc )
+ wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc )
+ usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
+ p50=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' )
+ p75=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $5 }' )
+ p99=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $7 }' )
+ p999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $9 }' )
+ p9999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $11 }' )
+ echo -e "$ops_sec\t$mb_sec\t$sum_size\t$lo_wgb\t$sum_wgb\t$wamp\t$wmb_ps\t$usecs_op\t$p50\t$p75\t$p99\t$p999\t$p9999\t$uptime\t$stall_time\t$stall_pct\t$test_name" \
+ >> $output_dir/report.txt
+}
+
+# Load the database with fillrandom (auto compactions disabled), summarize
+# that run, then run the db_bench "compact" benchmark on the loaded database.
+# Each step echoes its command line into its own log file and appends the
+# db_bench output to it.
+function run_bulkload {
+ # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the
+ # client can discover where to restart a load after a crash. I think this is a good way to load.
+ echo "Bulk loading $num_keys random keys"
+ cmd="./db_bench --benchmarks=fillrandom \
+ --use_existing_db=0 \
+ --disable_auto_compactions=1 \
+ --sync=0 \
+ $params_bulkload \
+ --threads=1 \
+ --memtablerep=vector \
+ --allow_concurrent_memtable_write=false \
+ --disable_wal=1 \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
+ echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log
+ eval $cmd
+ summarize_result $output_dir/benchmark_bulkload_fillrandom.log bulkload fillrandom
+ echo "Compacting..."
+ # The compaction step is not passed to summarize_result.
+ cmd="./db_bench --benchmarks=compact \
+ --use_existing_db=1 \
+ --disable_auto_compactions=1 \
+ --sync=0 \
+ $params_w \
+ --threads=1 \
+ 2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
+ echo $cmd | tee $output_dir/benchmark_bulkload_compact.log
+ eval $cmd
+}
+
+#
+# Parameter description:
+#
+# $1 - 1 if I/O statistics should be collected.
+# $2 - compaction type to use (level=0, universal=1).
+# $3 - number of subcompactions.
+# $4 - number of maximum background compactions.
+#
+function run_manual_compaction_worker {
+ # This runs with a vector memtable and the WAL disabled to load faster.
+ # It is still crash safe and the client can discover where to restart a
+ # load after a crash. I think this is a good way to load.
+ echo "Bulk loading $num_keys random keys for manual compaction."
+
+ # Both log file names carry the number of subcompactions ($3).
+ fillrandom_output_file=$output_dir/benchmark_man_compact_fillrandom_$3.log
+ man_compact_output_log=$output_dir/benchmark_man_compact_$3.log
+
+ # Select the parameter set that matches the compaction type ($2).
+ if [ "$2" == "1" ]; then
+ extra_params=$params_univ_compact
+ else
+ extra_params=$params_level_compact
+ fi
+
+ # Make sure that fillrandom uses the same compaction options as compact.
+ cmd="./db_bench --benchmarks=fillrandom \
+ --use_existing_db=0 \
+ --disable_auto_compactions=0 \
+ --sync=0 \
+ $extra_params \
+ --threads=$num_threads \
+ --compaction_measure_io_stats=$1 \
+ --compaction_style=$2 \
+ --subcompactions=$3 \
+ --memtablerep=vector \
+ --allow_concurrent_memtable_write=false \
+ --disable_wal=1 \
+ --max_background_compactions=$4 \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $fillrandom_output_file"
+
+ echo $cmd | tee $fillrandom_output_file
+ eval $cmd
+
+ summarize_result $fillrandom_output_file man_compact_fillrandom_$3 fillrandom
+
+ echo "Compacting with $3 subcompactions specified ..."
+
+ # This is the part we're really interested in. Given that compact benchmark
+ # doesn't output regular statistics then we'll just use the time command to
+ # measure how long this step takes.
+ # The command is grouped in { ...; } so that the output of 'time' is also
+ # covered by the 2>&1 redirection and reaches the log file.
+ cmd="{ \
+ time ./db_bench --benchmarks=compact \
+ --use_existing_db=1 \
+ --disable_auto_compactions=0 \
+ --sync=0 \
+ $extra_params \
+ --threads=$num_threads \
+ --compaction_measure_io_stats=$1 \
+ --compaction_style=$2 \
+ --subcompactions=$3 \
+ --max_background_compactions=$4 \
+ ;}
+ 2>&1 | tee -a $man_compact_output_log"
+
+ echo $cmd | tee $man_compact_output_log
+ eval $cmd
+
+ # Can't use summarize_result here. One way to analyze the results is to run
+ # "grep real" on the resulting log files.
+}
+
+# Run run_manual_compaction_worker once per benchmark configuration, using
+# universal compaction with I/O statistics enabled.
+function run_univ_compaction {
+  # Always ask for I/O statistics to be measured.
+  io_stats=1
+
+  # Values: kCompactionStyleLevel = 0x0, kCompactionStyleUniversal = 0x1.
+  compaction_style=1
+
+  # Define a set of benchmarks: subcompactions[k] is run together with
+  # max_background_compactions[k].
+  subcompactions=(1 2 4 8 16)
+  max_background_compactions=(16 16 8 4 2)
+
+  total=${#subcompactions[@]}
+
+  # Execute a set of benchmarks to cover variety of scenarios.
+  for (( i = 0; i < total; i++ )); do
+    run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \
+      ${max_background_compactions[$i]}
+  done
+}
+
+# Load the database sequentially with fillseq and summarize the run.
+#
+# $1 - value passed to --disable_wal (1 disables the WAL, 0 enables it); it
+#      also selects the log file name and the test name.
+function run_fillseq {
+ # This runs with a vector memtable. WAL can be either disabled or enabled
+ # depending on the input parameter (1 for disabled, 0 for enabled). The main
+ # benefit behind disabling WAL is to make loading faster. It is still crash
+ # safe and the client can discover where to restart a load after a crash. I
+ # think this is a good way to load.
+
+ # Make sure that we'll have unique names for all the files so that data won't
+ # be overwritten.
+ if [ $1 == 1 ]; then
+ log_file_name=$output_dir/benchmark_fillseq.wal_disabled.v${value_size}.log
+ test_name=fillseq.wal_disabled.v${value_size}
+ else
+ log_file_name=$output_dir/benchmark_fillseq.wal_enabled.v${value_size}.log
+ test_name=fillseq.wal_enabled.v${value_size}
+ fi
+
+ echo "Loading $num_keys keys sequentially"
+ cmd="./db_bench --benchmarks=fillseq \
+ --use_existing_db=0 \
+ --sync=0 \
+ $params_fillseq \
+ --min_level_to_compress=0 \
+ --threads=1 \
+ --memtablerep=vector \
+ --allow_concurrent_memtable_write=false \
+ --disable_wal=$1 \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $log_file_name"
+ echo $cmd | tee $log_file_name
+ eval $cmd
+
+ # The constant "fillseq" which we pass to db_bench is the benchmark name.
+ summarize_result $log_file_name $test_name fillseq
+}
+
+# Run a db_bench benchmark on the existing database with $num_threads
+# threads and --sync=$syncval, then summarize the run.
+#
+# $1 - db_bench benchmark name; the job loop passes overwrite, updaterandom
+#      or mergerandom.
+function run_change {
+ operation=$1
+ echo "Do $num_keys random $operation"
+ out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
+ cmd="./db_bench --benchmarks=$operation \
+ --use_existing_db=1 \
+ --sync=$syncval \
+ $params_w \
+ --threads=$num_threads \
+ --merge_operator=\"put\" \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/${out_name}"
+ # The command line becomes the first line of the log; tee -a in the command
+ # appends the db_bench output after it.
+ echo $cmd | tee $output_dir/${out_name}
+ eval $cmd
+ summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation
+}
+
+# Create a new database with the filluniquerandom benchmark (one thread,
+# --sync=0) and summarize the run.
+function run_filluniquerandom {
+ echo "Loading $num_keys unique keys randomly"
+ cmd="./db_bench --benchmarks=filluniquerandom \
+ --use_existing_db=0 \
+ --sync=0 \
+ $params_w \
+ --threads=1 \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log"
+ echo $cmd | tee $output_dir/benchmark_filluniquerandom.log
+ eval $cmd
+ summarize_result $output_dir/benchmark_filluniquerandom.log filluniquerandom filluniquerandom
+}
+
+# Run the readrandom benchmark on the existing database with $num_threads
+# threads and summarize the run.
+function run_readrandom {
+ echo "Reading $num_keys random keys"
+ out_name="benchmark_readrandom.t${num_threads}.log"
+ cmd="./db_bench --benchmarks=readrandom \
+ --use_existing_db=1 \
+ $params_w \
+ --threads=$num_threads \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/${out_name}"
+ echo $cmd | tee $output_dir/${out_name}
+ eval $cmd
+ summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom
+}
+
+# Run the db_bench benchmark readwhile<operation> on the existing database
+# and summarize the run.
+#
+# $1 - operation suffix; the job loop passes "writing" or "merging".
+function run_readwhile {
+ operation=$1
+ echo "Reading $num_keys random keys while $operation"
+ out_name="benchmark_readwhile${operation}.t${num_threads}.log"
+ cmd="./db_bench --benchmarks=readwhile${operation} \
+ --use_existing_db=1 \
+ --sync=$syncval \
+ $params_w \
+ --threads=$num_threads \
+ --merge_operator=\"put\" \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/${out_name}"
+ echo $cmd | tee $output_dir/${out_name}
+ eval $cmd
+ summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation}
+}
+
+# Run the db_bench benchmark seekrandomwhile<operation> on the existing
+# database and summarize the run.
+#
+# $1 - operation suffix; the job loop passes "writing" or "merging".
+# $2 - job name, used for the log file name and the test name.
+# $3 - value passed to --reverse_iterator (true or false).
+function run_rangewhile {
+ operation=$1
+ full_name=$2
+ reverse_arg=$3
+ out_name="benchmark_${full_name}.t${num_threads}.log"
+ echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}"
+ cmd="./db_bench --benchmarks=seekrandomwhile${operation} \
+ --use_existing_db=1 \
+ --sync=$syncval \
+ $params_w \
+ --threads=$num_threads \
+ --merge_operator=\"put\" \
+ --seek_nexts=$num_nexts_per_seek \
+ --reverse_iterator=$reverse_arg \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/${out_name}"
+ echo $cmd | tee $output_dir/${out_name}
+ eval $cmd
+ summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandomwhile${operation}
+}
+
+# Run the seekrandom benchmark on the existing database and summarize the
+# run.
+#
+# $1 - job name, used for the log file name and the test name.
+# $2 - value passed to --reverse_iterator (true or false).
+function run_range {
+ full_name=$1
+ reverse_arg=$2
+ out_name="benchmark_${full_name}.t${num_threads}.log"
+ echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}"
+ cmd="./db_bench --benchmarks=seekrandom \
+ --use_existing_db=1 \
+ $params_w \
+ --threads=$num_threads \
+ --seek_nexts=$num_nexts_per_seek \
+ --reverse_iterator=$reverse_arg \
+ --seed=$( date +%s ) \
+ 2>&1 | tee -a $output_dir/${out_name}"
+ echo $cmd | tee $output_dir/${out_name}
+ eval $cmd
+ summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandom
+}
+
+# Run the randomtransaction benchmark with --transaction_db, 5 threads and
+# 5 transaction sets. The command line and the db_bench output both go to
+# $output_dir/benchmark_randomtransaction.log. summarize_result is not
+# called, so this job adds no line to report.txt.
+function run_randomtransaction {
+  echo "..."
+  log_file_name=$output_dir/benchmark_randomtransaction.log
+  # $params_w supplies --db, --wal_dir and the other shared options;
+  # $params_r is not defined anywhere in this script.
+  cmd="./db_bench $params_w --benchmarks=randomtransaction \
+       --num=$num_keys \
+       --transaction_db \
+       --threads=5 \
+       --transaction_sets=5 \
+       2>&1 | tee -a $log_file_name"
+  # As in the other run_* functions: the command line starts the log and
+  # tee -a appends the benchmark output to the same file.
+  echo $cmd | tee $log_file_name
+  eval $cmd
+}
+
+# Print the current time as seconds since the epoch.
+function now() {
+  date +"%s"
+}
+
+report="$output_dir/report.txt"
+schedule="$output_dir/schedule.txt"
+
+echo "===== Benchmark ====="
+
+# Run!!!
+# $1 is a comma-separated list of jobs; they are run in the given order.
+IFS=',' read -a jobs <<< $1
+# shellcheck disable=SC2068
+for job in ${jobs[@]}; do
+
+  # "debug" only changes num_keys, so it is not recorded in the schedule.
+  if [ "$job" != debug ]; then
+    echo "Start $job at `date`" | tee -a $schedule
+  fi
+
+  start=$(now)
+  case "$job" in
+    bulkload)
+      run_bulkload
+      ;;
+    fillseq_disable_wal)
+      run_fillseq 1
+      ;;
+    fillseq_enable_wal)
+      run_fillseq 0
+      ;;
+    overwrite)
+      # These settings stay in effect for the jobs that follow in the list.
+      syncval="0"
+      params_w="$params_w \
+        --writes=125000000 \
+        --subcompactions=4 \
+        --soft_pending_compaction_bytes_limit=$((1 * T)) \
+        --hard_pending_compaction_bytes_limit=$((4 * T)) "
+      run_change overwrite
+      ;;
+    updaterandom)
+      run_change updaterandom
+      ;;
+    mergerandom)
+      run_change mergerandom
+      ;;
+    filluniquerandom)
+      run_filluniquerandom
+      ;;
+    readrandom)
+      run_readrandom
+      ;;
+    fwdrange)
+      run_range $job false
+      ;;
+    revrange)
+      run_range $job true
+      ;;
+    readwhilewriting)
+      run_readwhile writing
+      ;;
+    readwhilemerging)
+      run_readwhile merging
+      ;;
+    fwdrangewhilewriting)
+      run_rangewhile writing $job false
+      ;;
+    revrangewhilewriting)
+      run_rangewhile writing $job true
+      ;;
+    fwdrangewhilemerging)
+      run_rangewhile merging $job false
+      ;;
+    revrangewhilemerging)
+      run_rangewhile merging $job true
+      ;;
+    randomtransaction)
+      run_randomtransaction
+      ;;
+    universal_compaction)
+      run_univ_compaction
+      ;;
+    debug)
+      num_keys=1000; # debug
+      echo "Setting num_keys to $num_keys"
+      ;;
+    *)
+      echo "unknown job $job"
+      # A bare "exit" would return the status of the echo above, i.e. 0.
+      exit 1
+      ;;
+  esac
+  end=$(now)
+
+  if [ "$job" != debug ]; then
+    echo "Complete $job in $((end-start)) seconds" | tee -a $schedule
+  fi
+
+  # Show the column header followed by the most recent line of the report.
+  echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest"
+  tail -1 $report
+
+done
diff --git a/src/rocksdb/tools/benchmark_leveldb.sh b/src/rocksdb/tools/benchmark_leveldb.sh
new file mode 100755
index 000000000..069b53a9f
--- /dev/null
+++ b/src/rocksdb/tools/benchmark_leveldb.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: db_bench binary exists in the current directory
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+# https://github.com/mdcallag/leveldb-1
+# http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+
# Exactly one argument is required: a comma-separated list of jobs to run.
if [ $# -ne 1 ]; then
  # Usage errors go to stderr with a non-zero status so that calling scripts
  # can detect the failure.
  echo "Usage: $0 [fillseq/overwrite/readrandom/readwhilewriting]" >&2
  exit 1
fi

# size constants
K=1024
M=$((1024 * K))
G=$((1024 * M))

# DB_DIR is passed to db_bench as --db and has no default.
if [ -z "$DB_DIR" ]; then
  echo "DB_DIR is not defined" >&2
  exit 1
fi

# Benchmark logs, report.txt and schedule.txt are written below this directory.
output_dir=${OUTPUT_DIR:-/tmp/}
if [ ! -d "$output_dir" ]; then
  mkdir -p "$output_dir"
fi

# all multithreaded tests run with sync=1 unless
# $DB_BENCH_NO_SYNC is defined
syncval="1"
if [ -n "$DB_BENCH_NO_SYNC" ]; then
  echo "Turning sync off for all multithreaded tests"
  syncval="0"
fi

num_threads=${NUM_THREADS:-16}
# Only for *whilewriting, *whilemerging
writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
cache_size=${CACHE_SIZE:-$((1 * G))}

num_keys=${NUM_KEYS:-$((1 * G))}
# NOTE: key_size and block_size are defined here but are not referenced by any
# command line built in this script.
key_size=20
value_size=${VALUE_SIZE:-400}
block_size=${BLOCK_SIZE:-4096}

# Flags shared by every db_bench invocation. The variables are expanded here,
# once, so later changes to e.g. num_keys do not alter these flags.
const_params="
  --db=$DB_DIR \
  \
  --num=$num_keys \
  --value_size=$value_size \
  --cache_size=$cache_size \
  --compression_ratio=0.5 \
  \
  --write_buffer_size=$((2 * M)) \
  \
  --histogram=1 \
  \
  --bloom_bits=10 \
  --open_files=$((20 * K))"

params_w="$const_params "
+
function summarize_result {
  # Parse a db_bench log and append one tab-separated result row to
  # $output_dir/report.txt.
  #   $1 - db_bench log file to parse
  #   $2 - test label written as the last column of the row
  #   $3 - benchmark name; log lines starting with it carry the result figures
  #   $4 - thread count used to scale micros/op into total ops/sec
  local test_out=$1
  local test_name=$2
  local bench_name=$3
  local nthr=$4
  local usecs_op mb_sec ops_sec avg p50

  # Field 3 / field 5 of the benchmark result line.
  usecs_op=$( grep "^${bench_name}" "$test_out" | awk '{ printf "%.1f", $3 }' )
  mb_sec=$( grep "^${bench_name}" "$test_out" | awk '{ printf "%.1f", $5 }' )
  # Field 4 of the histogram lines starting with "Count:" and "Min:".
  avg=$( grep "^Count:" "$test_out" | awk '{ printf "%.1f", $4 }' )
  p50=$( grep "^Min:" "$test_out" | awk '{ printf "%.1f", $4 }' )

  # If the result line is missing (usecs_op empty) or reports 0.0 micros/op,
  # the division below would make bc fail; report 0 ops/sec instead.
  case "$usecs_op" in
    ""|0.0)
      ops_sec=0
      ;;
    *)
      ops_sec=$( echo "scale=0; (1000000.0 * $nthr) / $usecs_op" | bc )
      ;;
  esac

  echo -e "$ops_sec\t$mb_sec\t$usecs_op\t$avg\t$p50\t$test_name" \
    >> "$output_dir/report.txt"
}
+
function run_fillseq {
  # Sequentially load $num_keys keys into a fresh database (use_existing_db=0)
  # with a single thread and sync=0. The command line is written to the log
  # first (truncating any earlier log); db_bench output is then appended and a
  # summary row is added to report.txt.
  local log_path="$output_dir/benchmark_fillseq.v${value_size}.log"

  echo "Loading $num_keys keys sequentially"
  cmd="./db_bench --benchmarks=fillseq \
       --use_existing_db=0 \
       --sync=0 \
       $params_w \
       --threads=1 \
       --seed=$( date +%s ) \
       2>&1 | tee -a $log_path"
  echo $cmd | tee $log_path
  eval $cmd
  summarize_result $log_path fillseq.v${value_size} fillseq 1
}
+
function run_change {
  # Run the write benchmark named by $1 (e.g. overwrite) against the existing
  # database with $num_threads threads and sync=$syncval, then add a summary
  # row to report.txt.
  local bench=$1
  local tag="${bench}.t${num_threads}.s${syncval}"
  local log_path="$output_dir/benchmark_${tag}.log"

  echo "Do $num_keys random $bench"
  cmd="./db_bench --benchmarks=$bench \
       --use_existing_db=1 \
       --sync=$syncval \
       $params_w \
       --threads=$num_threads \
       --seed=$( date +%s ) \
       2>&1 | tee -a $log_path"
  # Start the log with the exact command line, replacing any earlier log.
  echo $cmd | tee $log_path
  eval $cmd
  summarize_result $log_path $tag $bench $num_threads
}
+
function run_readrandom {
  # Run the readrandom benchmark against the existing database with
  # $num_threads threads, then add a summary row to report.txt.
  local tag="readrandom.t${num_threads}"
  local log_path="$output_dir/benchmark_${tag}.log"

  echo "Reading $num_keys random keys"
  cmd="./db_bench --benchmarks=readrandom \
       --use_existing_db=1 \
       $params_w \
       --threads=$num_threads \
       --seed=$( date +%s ) \
       2>&1 | tee -a $log_path"
  # Start the log with the exact command line, replacing any earlier log.
  echo $cmd | tee $log_path
  eval $cmd
  summarize_result $log_path $tag readrandom $num_threads
}
+
function run_readwhile {
  # Run the readwhile<$1> benchmark (the job loop passes "writing") against
  # the existing database, with the rate given by $writes_per_second, then add
  # a summary row to report.txt.
  local activity=$1
  local bench="readwhile${activity}"
  local tag="${bench}.t${num_threads}"
  local log_path="$output_dir/benchmark_${tag}.log"

  echo "Reading $num_keys random keys while $activity"
  cmd="./db_bench --benchmarks=$bench \
       --use_existing_db=1 \
       --sync=$syncval \
       $params_w \
       --threads=$num_threads \
       --writes_per_second=$writes_per_second \
       --seed=$( date +%s ) \
       2>&1 | tee -a $log_path"
  # Start the log with the exact command line, replacing any earlier log.
  echo $cmd | tee $log_path
  eval $cmd
  summarize_result $log_path $tag $bench $num_threads
}
+
function now() {
  # Print the current time as seconds since the Unix epoch.
  date +"%s"
}
+
report="$output_dir/report.txt"
schedule="$output_dir/schedule.txt"

echo "===== Benchmark ====="

# Run!!!
# The single command-line argument is a comma-separated list of job names.
IFS=',' read -r -a jobs <<< "$1"
# shellcheck disable=SC2068
for job in ${jobs[@]}; do

  if [ "$job" != debug ]; then
    echo "Start $job at $(date)" | tee -a "$schedule"
  fi

  start=$(now)
  case "$job" in
    fillseq)
      run_fillseq
      ;;
    overwrite)
      run_change overwrite
      ;;
    readrandom)
      run_readrandom
      ;;
    readwhilewriting)
      run_readwhile writing
      ;;
    debug)
      # NOTE: const_params was expanded before this loop, so the --num flag of
      # later jobs keeps its earlier value; only the key count printed in
      # progress messages changes.
      num_keys=1000; # debug
      echo "Setting num_keys to $num_keys"
      ;;
    *)
      # Fail with a non-zero status so callers can tell that the job list
      # was invalid.
      echo "unknown job $job" >&2
      exit 1
      ;;
  esac
  end=$(now)

  if [ "$job" != debug ]; then
    echo "Complete $job in $((end-start)) seconds" | tee -a "$schedule"
  fi

  # Print the column header followed by the most recent report row.
  echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest"
  tail -1 "$report"

done
diff --git a/src/rocksdb/tools/blob_dump.cc b/src/rocksdb/tools/blob_dump.cc
new file mode 100644
index 000000000..2ae73ecfe
--- /dev/null
+++ b/src/rocksdb/tools/blob_dump.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <getopt.h>
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/blob_db/blob_dump_tool.h"
+
+using namespace ROCKSDB_NAMESPACE;
+using namespace ROCKSDB_NAMESPACE::blob_db;
+
+int main(int argc, char** argv) {
+ using DisplayType = BlobDumpTool::DisplayType;
+ const std::unordered_map<std::string, DisplayType> display_types = {
+ {"none", DisplayType::kNone},
+ {"raw", DisplayType::kRaw},
+ {"hex", DisplayType::kHex},
+ {"detail", DisplayType::kDetail},
+ };
+ const struct option options[] = {
+ {"help", no_argument, nullptr, 'h'},
+ {"file", required_argument, nullptr, 'f'},
+ {"show_key", optional_argument, nullptr, 'k'},
+ {"show_blob", optional_argument, nullptr, 'b'},
+ {"show_uncompressed_blob", optional_argument, nullptr, 'r'},
+ {"show_summary", optional_argument, nullptr, 's'},
+ };
+ DisplayType show_key = DisplayType::kRaw;
+ DisplayType show_blob = DisplayType::kNone;
+ DisplayType show_uncompressed_blob = DisplayType::kNone;
+ bool show_summary = false;
+ std::string file;
+ while (true) {
+ int c = getopt_long(argc, argv, "hk::b::f:", options, nullptr);
+ if (c < 0) {
+ break;
+ }
+ std::string arg_str(optarg ? optarg : "");
+ switch (c) {
+ case 'h':
+ fprintf(stdout,
+ "Usage: blob_dump --file=filename "
+ "[--show_key[=none|raw|hex|detail]] "
+ "[--show_blob[=none|raw|hex|detail]] "
+ "[--show_uncompressed_blob[=none|raw|hex|detail]] "
+ "[--show_summary]\n");
+ return 0;
+ case 'f':
+ file = optarg;
+ break;
+ case 'k':
+ if (optarg) {
+ if (display_types.count(arg_str) == 0) {
+ fprintf(stderr, "Unrecognized key display type.\n");
+ return -1;
+ }
+ show_key = display_types.at(arg_str);
+ }
+ break;
+ case 'b':
+ if (optarg) {
+ if (display_types.count(arg_str) == 0) {
+ fprintf(stderr, "Unrecognized blob display type.\n");
+ return -1;
+ }
+ show_blob = display_types.at(arg_str);
+ } else {
+ show_blob = DisplayType::kHex;
+ }
+ break;
+ case 'r':
+ if (optarg) {
+ if (display_types.count(arg_str) == 0) {
+ fprintf(stderr, "Unrecognized blob display type.\n");
+ return -1;
+ }
+ show_uncompressed_blob = display_types.at(arg_str);
+ } else {
+ show_uncompressed_blob = DisplayType::kHex;
+ }
+ break;
+ case 's':
+ show_summary = true;
+ break;
+ default:
+ fprintf(stderr, "Unrecognized option.\n");
+ return -1;
+ }
+ }
+ BlobDumpTool tool;
+ Status s =
+ tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary);
+ if (!s.ok()) {
+ fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
+ return -1;
+ }
+ return 0;
+}
+#else
+#include <stdio.h>
// Lite builds do not provide the blob dump tool: report that on stderr and
// return a failure status.
int main(int /*argc*/, char** /*argv*/) {
  fputs("Not supported in lite mode.\n", stderr);
  return -1;
}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/__init__.py b/src/rocksdb/tools/block_cache_analyzer/__init__.py
new file mode 100644
index 000000000..8dbe96a78
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py
new file mode 100644
index 000000000..67307df53
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py
@@ -0,0 +1,2000 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
import functools
import gc
import heapq
import random
import sys
import time
from collections import OrderedDict
from os import path

import numpy as np
+
+
# The sample size used when performing eviction.
kSampleSize = 64
# Unit-conversion factors used when bucketing trace timestamps.
kMicrosInSecond = 10 ** 6
kSecondsInMinute = 60
kSecondsInHour = 60 * kSecondsInMinute
+
+
class TraceRecord:
    """
    One block access read from a block cache trace.

    The fields mirror BlockCacheTraceRecord in
    trace_replay/block_cache_tracer.h. Two kinds of adjustment are made on
    construction: 'block_size' also accounts for the size of the block key,
    and the 0/1 flags are stored as booleans.
    """

    def __init__(
        self,
        access_time,
        block_id,
        block_type,
        block_size,
        cf_id,
        cf_name,
        level,
        fd,
        caller,
        no_insert,
        get_id,
        key_id,
        kv_size,
        is_hit,
        referenced_key_exist_in_block,
        num_keys_in_block,
        table_id,
        seq_number,
        block_key_size,
        key_size,
        block_offset_in_file,
        next_access_seq_no,
    ):
        # Fields stored exactly as given.
        self.access_time = access_time
        self.block_id = block_id
        self.block_type = block_type
        self.cf_id = cf_id
        self.cf_name = cf_name
        self.level = level
        self.fd = fd
        self.caller = caller
        self.get_id = get_id
        self.key_id = key_id
        self.kv_size = kv_size
        self.num_keys_in_block = num_keys_in_block
        self.table_id = table_id
        self.seq_number = seq_number
        self.block_key_size = block_key_size
        self.key_size = key_size
        self.block_offset_in_file = block_offset_in_file
        self.next_access_seq_no = next_access_seq_no

        # The recorded size of a block includes its key.
        self.block_size = block_size + block_key_size

        # Flags: only the value 1 counts as true.
        self.no_insert = True if no_insert == 1 else False
        self.is_hit = True if is_hit == 1 else False
        self.referenced_key_exist_in_block = (
            True if referenced_key_exist_in_block == 1 else False
        )
+
+
class CacheEntry:
    """
    A cache entry stored in the cache.

    Besides the size of the cached value, it records the metadata used for
    eviction decisions: column family, level, block type, table, the number
    of hits, and the time and sequence number of the last access.
    """

    def __init__(
        self,
        value_size,
        cf_id,
        level,
        block_type,
        table_id,
        access_number,
        time_s,
        num_hits=0,
    ):
        self.value_size = value_size
        self.last_access_number = access_number
        self.num_hits = num_hits
        # Keep the caller's column family id; it feeds the "cf" and "cf_bt"
        # cost classes and the debug string.
        self.cf_id = cf_id
        self.level = level
        self.block_type = block_type
        # A new entry counts as both inserted and last accessed at 'time_s'.
        self.last_access_time = time_s
        self.insertion_time = time_s
        self.table_id = table_id

    def __repr__(self):
        """Debug string."""
        return "(s={},last={},hits={},cf={},l={},bt={})\n".format(
            self.value_size,
            self.last_access_number,
            self.num_hits,
            self.cf_id,
            self.level,
            self.block_type,
        )

    def cost_class(self, cost_class_label):
        """
        Return the name of the cost class this entry belongs to.

        'cost_class_label' selects which attributes make up the class and
        must be one of "table_bt", "table", "bt", "cf", "cf_bt" or
        "table_level_bt". Raises AssertionError for any other label.
        """
        if cost_class_label == "table_bt":
            return "{}-{}".format(self.table_id, self.block_type)
        elif cost_class_label == "table":
            return "{}".format(self.table_id)
        elif cost_class_label == "bt":
            return "{}".format(self.block_type)
        elif cost_class_label == "cf":
            return "{}".format(self.cf_id)
        elif cost_class_label == "cf_bt":
            return "{}-{}".format(self.cf_id, self.block_type)
        elif cost_class_label == "table_level_bt":
            return "{}-{}-{}".format(self.table_id, self.level, self.block_type)
        # Raised explicitly so the check also holds when Python runs with -O,
        # which strips assert statements.
        raise AssertionError("Unknown cost class label {}".format(cost_class_label))
+
+
class HashEntry:
    """A (key, hash, value) triple stored in a HashTable bucket."""

    def __init__(self, key, hash, value):
        self.key = key
        self.hash = hash
        self.value = value

    def __repr__(self):
        return "k={},h={},v=[{}]".format(self.key, self.hash, self.value)


class HashTable:
    """
    A custom hash table that supports fast random sampling.

    Each slot of 'self.table' is either None or a list (chain) of HashEntry
    objects; a deleted entry leaves a None hole in its chain that a later
    insert may reuse. The caller supplies the hash of every key. The table is
    resized on insertion/deletion to keep lookups and sampling fast.
    """

    def __init__(self):
        self.initial_size = 32
        self.table = [None] * self.initial_size
        self.elements = 0

    def _live_entries(self):
        """Yield every stored HashEntry in slot order, skipping holes."""
        for chain in self.table:
            if chain is None:
                continue
            for entry in chain:
                if entry is not None:
                    yield entry

    def random_sample(self, sample_size):
        """
        Return up to 'sample_size' hash entries.

        Entries are collected from a randomly chosen slot onwards, wrapping
        around the table, until enough were gathered or every slot was
        visited once.
        """
        picked = []
        first_slot = random.randint(0, len(self.table) - 1)
        slot = first_slot
        while True:
            chain = self.table[slot]
            if chain is not None:
                for entry in chain:
                    if entry is None:
                        continue
                    picked.append(entry)
                    if len(picked) == sample_size:
                        break
            slot = (slot + 1) % len(self.table)
            if slot == first_slot or len(picked) == sample_size:
                break
        assert len(picked) <= sample_size
        return picked

    def __repr__(self):
        return "{}".format(list(self._live_entries()))

    def values(self):
        """Return the values of all stored entries as a list."""
        return [entry.value for entry in self._live_entries()]

    def __len__(self):
        return self.elements

    def insert(self, key, hash, value):
        """
        Store 'value' under ('key', 'hash'), replacing the entry that already
        holds this key and hash if there is one.
        """
        self.grow()
        slot = hash % len(self.table)
        if self.table[slot] is None:
            self.table[slot] = []
        chain = self.table[slot]

        first_hole = None
        for position, entry in enumerate(chain):
            if entry is None:
                # Remember the first hole in case the key is not present.
                if first_hole is None:
                    first_hole = position
                continue
            if entry.hash == hash and entry.key == key:
                # The key is already stored: overwrite it in place.
                chain[position] = HashEntry(key, hash, value)
                return

        # A new key: reuse the first hole if any, otherwise extend the chain.
        if first_hole is None:
            chain.append(HashEntry(key, hash, value))
        else:
            chain[first_hole] = HashEntry(key, hash, value)
        self.elements += 1

    def resize(self, new_size):
        """
        Rehash all entries into a table of 'new_size' slots.

        Nothing happens if the size is unchanged, if 'new_size' is below the
        initial size, or while fewer than 100 elements are stored.
        """
        if (
            new_size == len(self.table)
            or new_size < self.initial_size
            or self.elements < 100
        ):
            return
        rehashed = [None] * new_size
        for entry in self._live_entries():
            slot = entry.hash % new_size
            if rehashed[slot] is None:
                rehashed[slot] = []
            rehashed[slot].append(entry)
        self.table = rehashed
        del rehashed
        # The old table might be very large; collect it right away.
        gc.collect()

    def grow(self):
        """Grow the table by 1.5x once it holds 4 or more entries per slot."""
        if self.elements >= 4 * len(self.table):
            self.resize(int(len(self.table) * 1.5))

    def delete(self, key, hash):
        """
        Remove the entry stored under ('key', 'hash').

        Returns the removed HashEntry, or None if there was no such entry.
        """
        chain = self.table[hash % len(self.table)]
        if chain is None:
            return
        for position, entry in enumerate(chain):
            if entry is None or entry.hash != hash or entry.key != key:
                continue
            # Leave a hole in the chain; insert() may reuse it later.
            chain[position] = None
            self.elements -= 1
            self.shrink()
            return entry
        return None

    def shrink(self):
        """Shrink the table to 0.7x once under half the slots are in use."""
        if self.elements * 2 < len(self.table):
            self.resize(int(len(self.table) * 0.7))

    def lookup(self, key, hash):
        """Return the value stored under ('key', 'hash'), or None."""
        chain = self.table[hash % len(self.table)]
        if chain is None:
            return None
        for entry in chain:
            if entry is not None and entry.hash == hash and entry.key == key:
                return entry.value
        return None
+
+
class MissRatioStats:
    """
    Accumulates cache access and miss counters, in total and per time bucket.

    A bucket is 'time_unit' seconds wide. Timestamps are divided by
    kMicrosInSecond * time_unit to find their bucket, i.e. they are expected
    to be in microseconds.
    """

    def __init__(self, time_unit):
        self.num_misses = 0
        self.num_accesses = 0
        self.time_unit = time_unit
        # Per-bucket counters, keyed by the integer bucket index.
        self.time_misses = {}
        self.time_miss_bytes = {}
        self.time_accesses = {}

    def _bucket(self, timestamp):
        """Return the integer index of the bucket containing 'timestamp'."""
        # Floor division is essential: true division yields a distinct float
        # key for nearly every timestamp (so nothing is aggregated) and
        # range() rejects float bounds.
        return int(timestamp // (kMicrosInSecond * self.time_unit))

    def update_metrics(self, access_time, is_hit, miss_bytes):
        """Record one access; 'miss_bytes' is only counted for a miss."""
        bucket = self._bucket(access_time)
        self.num_accesses += 1
        self.time_accesses[bucket] = self.time_accesses.get(bucket, 0) + 1
        if not is_hit:
            self.num_misses += 1
            self.time_misses[bucket] = self.time_misses.get(bucket, 0) + 1
            self.time_miss_bytes[bucket] = (
                self.time_miss_bytes.get(bucket, 0) + miss_bytes
            )

    def reset_counter(self):
        """Clear all total and per-bucket counters."""
        self.num_misses = 0
        self.num_accesses = 0
        self.time_miss_bytes.clear()
        self.time_misses.clear()
        self.time_accesses.clear()

    def compute_miss_bytes(self):
        """
        Return (average, 95th percentile) of the per-bucket miss bytes.

        Only buckets with at least one miss are considered. Returns (0, 0)
        when no miss has been recorded.
        """
        miss_bytes = sorted(self.time_miss_bytes.values())
        if not miss_bytes:
            return 0, 0
        avg_miss_bytes = float(sum(miss_bytes)) / float(len(miss_bytes))
        p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1)
        return avg_miss_bytes, miss_bytes[p95_index]

    def miss_ratio(self):
        """Return the overall miss ratio in percent (0.0 without accesses)."""
        if self.num_accesses == 0:
            return 0.0
        return float(self.num_misses) * 100.0 / float(self.num_accesses)

    def _write_header_once(self, header_file_path, start, end):
        """Write the "time,<bucket>,..." header file unless it exists."""
        if path.exists(header_file_path):
            return
        columns = ["time"] + ["{}".format(t) for t in range(start, end)]
        with open(header_file_path, "w+") as header_file:
            header_file.write(",".join(columns) + "\n")

    def write_miss_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        """
        Write the number of misses per bucket as one CSV row.

        'start' and 'end' are timestamps; one column is written per bucket
        in [bucket(start), bucket(end)). The data file in 'result_dir' is
        overwritten; the matching header file is only created if missing.
        """
        start = self._bucket(start)
        end = self._bucket(end)
        self._write_header_once(
            "{}/header-ml-miss-timeline-{}-{}-{}-{}".format(
                result_dir, self.time_unit, cache_type, cache_size, target_cf_name
            ),
            start,
            end,
        )
        file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        cells = ["{}".format(cache_type)]
        for trace_time in range(start, end):
            cells.append("{}".format(self.time_misses.get(trace_time, 0)))
        with open(file_path, "w+") as file:
            file.write(",".join(cells) + "\n")

    def write_miss_ratio_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        """
        Write the miss ratio (percent, two decimals) per bucket as one CSV
        row. Buckets without accesses are reported as 0.00. File handling is
        the same as in write_miss_timeline.
        """
        start = self._bucket(start)
        end = self._bucket(end)
        self._write_header_once(
            "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
                result_dir, self.time_unit, cache_type, cache_size, target_cf_name
            ),
            start,
            end,
        )
        file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        cells = ["{}".format(cache_type)]
        for trace_time in range(start, end):
            naccesses = self.time_accesses.get(trace_time, 0)
            miss_ratio = 0
            if naccesses > 0:
                miss_ratio = float(
                    self.time_misses.get(trace_time, 0) * 100.0
                ) / float(naccesses)
            cells.append("{0:.2f}".format(miss_ratio))
        with open(file_path, "w+") as file:
            file.write(",".join(cells) + "\n")
+
+
class PolicyStats:
    """
    Counts, per time bucket, how often each policy was selected.

    'policies' is a sequence of objects providing policy_name(); a policy is
    later referred to by its index in that sequence. A bucket is 'time_unit'
    seconds wide and timestamps are divided by kMicrosInSecond * time_unit to
    find their bucket, i.e. they are expected to be in microseconds.
    """

    def __init__(self, time_unit, policies):
        # bucket -> {policy name -> number of selections}
        self.time_selected_polices = {}
        # bucket -> total number of selections
        self.time_accesses = {}
        # policy index -> policy name
        self.policy_names = {}
        self.time_unit = time_unit
        for i, policy in enumerate(policies):
            self.policy_names[i] = policy.policy_name()

    def _bucket(self, timestamp):
        """Return the integer index of the bucket containing 'timestamp'."""
        # Floor division is essential: true division yields a distinct float
        # key for nearly every timestamp (so nothing is aggregated) and
        # range() rejects float bounds.
        return int(timestamp // (kMicrosInSecond * self.time_unit))

    def update_metrics(self, access_time, selected_policy):
        """Record that the policy with index 'selected_policy' was chosen."""
        bucket = self._bucket(access_time)
        self.time_accesses[bucket] = self.time_accesses.get(bucket, 0) + 1
        per_policy = self.time_selected_polices.setdefault(bucket, {})
        policy_name = self.policy_names[selected_policy]
        per_policy[policy_name] = per_policy.get(policy_name, 0) + 1

    def _write_header_once(self, header_file_path, start, end):
        """Write the "time,<bucket>,..." header file unless it exists."""
        if path.exists(header_file_path):
            return
        columns = ["time"] + ["{}".format(t) for t in range(start, end)]
        with open(header_file_path, "w+") as header_file:
            header_file.write(",".join(columns) + "\n")

    def write_policy_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        """
        Write one CSV row per policy with its selection count per bucket.

        'start' and 'end' are timestamps; one column is written per bucket
        in [bucket(start), bucket(end)). The data file in 'result_dir' is
        overwritten; the matching header file is only created if missing.
        """
        start = self._bucket(start)
        end = self._bucket(end)
        self._write_header_once(
            "{}/header-ml-policy-timeline-{}-{}-{}-{}".format(
                result_dir, self.time_unit, cache_type, cache_size, target_cf_name
            ),
            start,
            end,
        )
        file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(file_path, "w+") as file:
            for policy_name in self.policy_names.values():
                cells = ["{}-{}".format(cache_type, policy_name)]
                for trace_time in range(start, end):
                    cells.append(
                        "{}".format(
                            self.time_selected_polices.get(trace_time, {}).get(
                                policy_name, 0
                            )
                        )
                    )
                file.write(",".join(cells) + "\n")

    def write_policy_ratio_timeline(
        self, cache_type, cache_size, target_cf_name, file_path, start, end
    ):
        """
        Write one CSV row per policy with its share of selections (percent,
        two decimals) per bucket; buckets without selections yield 0.00.

        Despite its name, 'file_path' is the result directory, exactly like
        'result_dir' in write_policy_timeline. The parameter name is kept so
        that existing callers keep working.
        """
        result_dir = file_path
        start = self._bucket(start)
        end = self._bucket(end)
        self._write_header_once(
            "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
                result_dir, self.time_unit, cache_type, cache_size, target_cf_name
            ),
            start,
            end,
        )
        data_file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(data_file_path, "w+") as file:
            for policy_name in self.policy_names.values():
                cells = ["{}-{}".format(cache_type, policy_name)]
                for trace_time in range(start, end):
                    naccesses = self.time_accesses.get(trace_time, 0)
                    ratio = 0
                    if naccesses > 0:
                        ratio = float(
                            self.time_selected_polices.get(trace_time, {}).get(
                                policy_name, 0
                            )
                            * 100.0
                        ) / float(naccesses)
                    cells.append("{0:.2f}".format(ratio))
                file.write(",".join(cells) + "\n")
+
+
class Policy(object):
    """
    A policy maintains a set of evicted keys. It returns a reward of one to
    itself if it has not evicted a missing key. Otherwise, it gives itself 0
    reward.

    Subclasses implement prioritize_samples(), which returns the sampled hash
    entries sorted in ascending order of the policy's comparison, and
    policy_name().
    """

    def __init__(self):
        # Only the keys matter; the stored value is always 0.
        self.evicted_keys = {}

    def evict(self, key, max_size):
        """Remember that this policy evicted 'key'. 'max_size' is unused."""
        self.evicted_keys[key] = 0

    def delete(self, key):
        """Forget 'key'; a key that was never evicted is ignored."""
        self.evicted_keys.pop(key, None)

    def prioritize_samples(self, samples, auxilliary_info):
        raise NotImplementedError

    def policy_name(self):
        raise NotImplementedError

    def generate_reward(self, key):
        """Return 0 if this policy evicted 'key', 1 otherwise."""
        if key in self.evicted_keys:
            return 0
        return 1


class LRUPolicy(Policy):
    """Orders samples from least to most recently accessed."""

    def prioritize_samples(self, samples, auxilliary_info):
        # sorted() has no 'cmp' argument in Python 3; a key function gives
        # the same (stable) order as comparing the access numbers.
        return sorted(samples, key=lambda e: e.value.last_access_number)

    def policy_name(self):
        return "lru"


class MRUPolicy(Policy):
    """Orders samples from most to least recently accessed."""

    def prioritize_samples(self, samples, auxilliary_info):
        # reverse=True keeps entries with equal access numbers in their
        # original relative order, as the reversed comparison would.
        return sorted(
            samples, key=lambda e: e.value.last_access_number, reverse=True
        )

    def policy_name(self):
        return "mru"


class LFUPolicy(Policy):
    """Orders samples from fewest to most hits."""

    def prioritize_samples(self, samples, auxilliary_info):
        return sorted(samples, key=lambda e: e.value.num_hits)

    def policy_name(self):
        return "lfu"


class HyperbolicPolicy(Policy):
    """
    An implementation of Hyperbolic caching.

    Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017.
    Hyperbolic caching: flexible caching for web applications. In Proceedings
    of the 2017 USENIX Conference on Usenix Annual Technical Conference
    (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511.
    """

    def compare(self, e1, e2, now):
        """
        Three-way comparison of two hash entries.

        Each entry is weighted by its time in the cache (seconds, never
        negative) multiplied by its size. Entries with equal weight are
        ordered by hit count; otherwise by hits per unit of weight. An entry
        with zero weight compares greater than one with non-zero weight.
        Returns a negative, zero or positive number.
        """
        e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float(
            e1.value.value_size
        )
        e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float(
            e2.value.value_size
        )
        if e1_duration == e2_duration:
            return e1.value.num_hits - e2.value.num_hits
        if e1_duration == 0:
            return 1
        if e2_duration == 0:
            # Mirror of the case above. Returning 1 here as well would make
            # compare(a, b) and compare(b, a) both claim "greater".
            return -1
        diff = (float(e1.value.num_hits) / (float(e1_duration))) - (
            float(e2.value.num_hits) / float(e2_duration)
        )
        if diff == 0:
            return 0
        elif diff > 0:
            return 1
        else:
            return -1

    def prioritize_samples(self, samples, auxilliary_info):
        """Sort 'samples' with compare(); auxilliary_info[0] is 'now'."""
        assert len(auxilliary_info) == 3
        now = auxilliary_info[0]
        return sorted(
            samples,
            key=functools.cmp_to_key(lambda e1, e2: self.compare(e1, e2, now)),
        )

    def policy_name(self):
        return "hb"


class CostClassPolicy(Policy):
    """
    We calculate the hit density of a cost class as
    number of hits / total size in cache * average duration in the cache.

    An entry has a higher priority if its class's hit density is higher.
    """

    def compare(self, e1, e2, now, cost_classes, cost_class_label):
        """
        Three-way comparison of two hash entries by their cost classes.

        'cost_classes' maps a cost class name to an object providing
        density(now), 'hits' and 'num_entries_in_cache'; the classes of both
        entries must be present in it. Returns a negative, zero or positive
        number.
        """
        e1_class = e1.value.cost_class(cost_class_label)
        e2_class = e2.value.cost_class(cost_class_label)

        assert e1_class in cost_classes
        assert e2_class in cost_classes

        e1_entry = cost_classes[e1_class]
        e2_entry = cost_classes[e2_class]
        e1_density = e1_entry.density(now)
        e2_density = e2_entry.density(now)
        e1_hits = cost_classes[e1_class].hits
        e2_hits = cost_classes[e2_class].hits

        if e1_density == e2_density:
            return e1_hits - e2_hits

        if e1_entry.num_entries_in_cache == 0:
            return -1
        if e2_entry.num_entries_in_cache == 0:
            return 1

        if e1_density == 0:
            return 1
        if e2_density == 0:
            return -1
        diff = (float(e1_hits) / float(e1_density)) - (
            float(e2_hits) / float(e2_density)
        )
        if diff == 0:
            return 0
        elif diff > 0:
            return 1
        else:
            return -1

    def prioritize_samples(self, samples, auxilliary_info):
        """
        Sort 'samples' with compare(). 'auxilliary_info' holds, in order:
        now, the cost class table and the cost class label.
        """
        assert len(auxilliary_info) == 3
        now = auxilliary_info[0]
        cost_classes = auxilliary_info[1]
        cost_class_label = auxilliary_info[2]
        return sorted(
            samples,
            key=functools.cmp_to_key(
                lambda e1, e2: self.compare(
                    e1, e2, now, cost_classes, cost_class_label
                )
            ),
        )

    def policy_name(self):
        return "cc"
+
+
class Cache(object):
    """
    Base class of the simulated caches with alternative replacement policies.

    Subclasses provide _lookup, _evict, _insert, _should_admit and
    cache_name. This class drives them for every trace record, tracks the
    used size and keeps miss ratio statistics at per-second, per-minute and
    per-hour granularity.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        self.cache_size = cache_size
        self.used_size = 0
        self.per_second_miss_ratio_stats = MissRatioStats(1)
        self.miss_ratio_stats = MissRatioStats(kSecondsInMinute)
        self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour)
        # 0: disabled. 1: enabled. Insert both row and the referenced data
        # block. 2: enabled. Insert only the row but NOT the referenced data
        # block.
        self.enable_cache_row_key = enable_cache_row_key
        # get id -> {"h": whether the get already hit, row key: whether the
        # row was inserted}.
        self.get_id_row_key_map = {}
        self.max_seen_get_id = 0
        # State of a get id is dropped once the largest seen get id is this
        # far ahead of it.
        self.retain_get_id_range = 100000

    def block_key(self, trace_record):
        """Cache key of the block accessed by 'trace_record'."""
        return "b{}".format(trace_record.block_id)

    def row_key(self, trace_record):
        """Cache key of the row (file and key) read by 'trace_record'."""
        return "g{}-{}".format(trace_record.fd, trace_record.key_id)

    def _lookup(self, trace_record, key, hash):
        """
        Look up the key in the cache.
        Returns true upon a cache hit, false otherwise.
        """
        raise NotImplementedError

    def _evict(self, trace_record, key, hash, value_size):
        """
        Evict entries in the cache until there is enough room to insert the new
        entry with 'value_size'.
        """
        raise NotImplementedError

    def _insert(self, trace_record, key, hash, value_size):
        """
        Insert the new entry into the cache.
        """
        raise NotImplementedError

    def _should_admit(self, trace_record, key, hash, value_size):
        """
        A custom admission policy to decide whether we should admit the new
        entry upon a cache miss.
        Returns true if the new entry should be admitted, false otherwise.
        """
        raise NotImplementedError

    def cache_name(self):
        """
        The name of the replacement policy.
        """
        raise NotImplementedError

    def is_ml_cache(self):
        return False

    def _update_stats(self, access_time, is_hit, miss_bytes):
        """Feed one access into the statistics of every granularity."""
        for stats in (
            self.per_second_miss_ratio_stats,
            self.miss_ratio_stats,
            self.per_hour_miss_ratio_stats,
        ):
            stats.update_metrics(access_time, is_hit, miss_bytes)

    def access(self, trace_record):
        """
        Access a trace record. The simulator calls this function to access a
        trace record.
        """
        assert self.used_size <= self.cache_size
        is_get_request = (
            self.enable_cache_row_key > 0
            and trace_record.caller == 1
            and trace_record.key_id != 0
            and trace_record.get_id != 0
        )
        if is_get_request:
            # This is a get request: go through the row cache logic.
            self._access_row(trace_record)
            return
        hit = self._access_kv(
            trace_record,
            self.block_key(trace_record),
            trace_record.block_id,
            trace_record.block_size,
            trace_record.no_insert,
        )
        self._update_stats(
            trace_record.access_time, is_hit=hit, miss_bytes=trace_record.block_size
        )

    def _access_row(self, trace_record):
        """
        Handle a block access issued on behalf of a get request.

        The row is looked up (and possibly inserted) the first time its key
        is seen for a get id. Once the row lookup of a get hit, every later
        access of that get counts as a hit. Otherwise the block itself is
        accessed and the row is inserted afterwards if that has not happened
        yet.
        """
        row_key = self.row_key(trace_record)
        get_id = trace_record.get_id
        self.max_seen_get_id = max(self.max_seen_get_id, get_id)
        # Drop the state of the get id that just fell out of the window.
        self.get_id_row_key_map.pop(
            self.max_seen_get_id - self.retain_get_id_range, None
        )
        if get_id not in self.get_id_row_key_map:
            self.get_id_row_key_map[get_id] = {"h": False}
        get_state = self.get_id_row_key_map[get_id]

        if get_state["h"]:
            # We treat future accesses as hits since this get request
            # completes.
            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
            return

        if row_key not in get_state:
            # First time seen this key.
            row_hit = self._access_kv(
                trace_record,
                key=row_key,
                hash=trace_record.key_id,
                value_size=trace_record.kv_size,
                no_insert=False,
            )
            get_state[row_key] = True if trace_record.kv_size > 0 else False
            get_state["h"] = row_hit

        if get_state["h"]:
            # We treat future accesses as hits since this get request
            # completes.
            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
            return

        # Access its blocks.
        skip_block_insert = trace_record.no_insert
        if (
            self.enable_cache_row_key == 2
            and trace_record.kv_size > 0
            and trace_record.block_type == 9
        ):
            # Mode 2 caches only the row, not the block it was read from.
            # NOTE(review): block type 9 is presumably the data block type;
            # confirm against the trace format.
            skip_block_insert = True
        block_hit = self._access_kv(
            trace_record,
            key=self.block_key(trace_record),
            hash=trace_record.block_id,
            value_size=trace_record.block_size,
            no_insert=skip_block_insert,
        )
        self._update_stats(
            trace_record.access_time, block_hit, miss_bytes=trace_record.block_size
        )

        if trace_record.kv_size > 0 and not get_state[row_key]:
            # Insert the row key-value pair.
            self._access_kv(
                trace_record,
                key=row_key,
                hash=trace_record.key_id,
                value_size=trace_record.kv_size,
                no_insert=False,
            )
            # Mark as inserted.
            get_state[row_key] = True

    def _access_kv(self, trace_record, key, hash, value_size, no_insert):
        """
        Look up 'key' and, upon a miss, try to insert it.

        Returns True upon a cache hit and False upon a miss, whether or not
        the entry was inserted. Nothing is inserted if 'no_insert' is set,
        if 'value_size' is not positive or exceeds the cache size, or if the
        admission policy rejects the entry. Eviction runs before the
        admission check.
        """
        # Sanity checks.
        assert self.used_size <= self.cache_size
        if self._lookup(trace_record, key, hash):
            # A cache hit.
            return True
        # A cache miss. Bail out if the entry must not or cannot be stored.
        if no_insert or value_size <= 0 or value_size > self.cache_size:
            return False
        self._evict(trace_record, key, hash, value_size)
        if not self._should_admit(trace_record, key, hash, value_size):
            return False
        self._insert(trace_record, key, hash, value_size)
        self.used_size += value_size
        return False
+
+
class CostClassEntry:
    """
    A cost class maintains aggregated statistics of cached entries in a class.
    For example, we may define block type as a class. Then, cached blocks of the
    same type will share one cost class entry.

    Time sums are kept in seconds: every timestamp passed in is divided by
    kMicrosInSecond before being accumulated.
    """

    def __init__(self):
        self.hits = 0
        self.num_entries_in_cache = 0
        self.size_in_cache = 0
        self.sum_insertion_times = 0
        self.sum_last_access_time = 0

    def insert(self, trace_record, key, value_size):
        """Account for a newly cached entry of `value_size` bytes."""
        self.size_in_cache += value_size
        self.num_entries_in_cache += 1
        self.sum_insertion_times += trace_record.access_time / kMicrosInSecond
        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond

    def remove(self, insertion_time, last_access_time, key, value_size, num_hits):
        """Undo the contribution of an entry that left the cache."""
        self.hits -= num_hits
        self.num_entries_in_cache -= 1
        self.sum_insertion_times -= insertion_time / kMicrosInSecond
        self.size_in_cache -= value_size
        self.sum_last_access_time -= last_access_time / kMicrosInSecond

    def update_on_hit(self, trace_record, last_access_time):
        """Record a hit and move the entry's last access time forward."""
        self.hits += 1
        self.sum_last_access_time -= last_access_time / kMicrosInSecond
        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond

    def avg_lifetime_in_cache(self, now):
        """Average time the class's entries have been cached, 0 if empty."""
        if self.num_entries_in_cache == 0:
            # Avoid dividing by zero for a class with no cached entries.
            return 0
        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
        return now / kMicrosInSecond - avg_insertion_time

    def avg_last_access_time(self):
        """Average last access time of the class's entries, 0 if empty."""
        if self.num_entries_in_cache == 0:
            return 0
        return float(self.sum_last_access_time) / float(self.num_entries_in_cache)

    def avg_size(self):
        """Average size in bytes of the class's cached entries, 0 if empty."""
        if self.num_entries_in_cache == 0:
            return 0
        # The average size is derived from the byte total, not from the
        # access-time sum.
        return float(self.size_in_cache) / float(self.num_entries_in_cache)

    def density(self, now):
        """Cached bytes multiplied by the average in-cache duration, 0 if empty."""
        if self.num_entries_in_cache == 0:
            return 0
        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
        in_cache_duration = now / kMicrosInSecond - avg_insertion_time
        return self.size_in_cache * in_cache_duration
+
+
class MLCache(Cache):
    """
    MLCache is the base class for implementations of alternative replacement
    policies using reinforcement learning.

    Subclasses implement _select_policy() to pick which entry of `policies`
    ranks the eviction candidates for the current access.
    """

    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
        super(MLCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = HashTable()
        self.policy_stats = PolicyStats(kSecondsInMinute, policies)
        self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies)
        self.policies = policies
        # Maps a cost class to its aggregated CostClassEntry statistics.
        self.cost_classes = {}
        self.cost_class_label = cost_class_label

    def is_ml_cache(self):
        return True

    def _lookup(self, trace_record, key, hash):
        """Return True on a hit, refreshing the entry's access metadata."""
        value = self.table.lookup(key, hash)
        if value is None:
            return False
        # Update the entry's cost class statistics.
        if self.cost_class_label is not None:
            cost_class = value.cost_class(self.cost_class_label)
            assert cost_class in self.cost_classes
            self.cost_classes[cost_class].update_on_hit(
                trace_record, value.last_access_time
            )
        # Update the entry's last access time by replacing it with a fresh
        # entry that carries over its attributes and bumps the hit count.
        self.table.insert(
            key,
            hash,
            CacheEntry(
                value_size=value.value_size,
                cf_id=value.cf_id,
                level=value.level,
                block_type=value.block_type,
                table_id=value.table_id,
                access_number=self.miss_ratio_stats.num_accesses,
                time_s=trace_record.access_time,
                num_hits=value.num_hits + 1,
            ),
        )
        return True

    def _evict(self, trace_record, key, hash, value_size):
        """Evict sampled entries until `value_size` bytes fit into the cache."""
        # Select a policy, random sample kSampleSize keys from the cache, then
        # evict keys in the sample set until we have enough room for the new
        # entry.
        policy_index = self._select_policy(trace_record, key)
        assert policy_index < len(self.policies) and policy_index >= 0
        policy = self.policies[policy_index]
        policy.delete(key)
        self.policy_stats.update_metrics(trace_record.access_time, policy_index)
        self.per_hour_policy_stats.update_metrics(
            trace_record.access_time, policy_index
        )
        while self.used_size + value_size > self.cache_size:
            # Randomly sample n entries.
            samples = self.table.random_sample(kSampleSize)
            samples = policy.prioritize_samples(
                samples,
                [trace_record.access_time, self.cost_classes, self.cost_class_label],
            )
            for hash_entry in samples:
                # Perform the delete outside the assert so that it still
                # happens when assertions are disabled (python -O).
                deleted = self.table.delete(hash_entry.key, hash_entry.hash)
                assert deleted is not None
                self.used_size -= hash_entry.value.value_size
                policy.evict(key=hash_entry.key, max_size=self.table.elements)
                # Update the entry's cost class statistics.
                if self.cost_class_label is not None:
                    cost_class = hash_entry.value.cost_class(self.cost_class_label)
                    assert cost_class in self.cost_classes
                    self.cost_classes[cost_class].remove(
                        hash_entry.value.insertion_time,
                        hash_entry.value.last_access_time,
                        hash_entry.key,
                        hash_entry.value.value_size,
                        hash_entry.value.num_hits,
                    )
                if self.used_size + value_size <= self.cache_size:
                    break

    def _insert(self, trace_record, key, hash, value_size):
        """Insert a new entry; the caller must have made room for it."""
        assert self.used_size + value_size <= self.cache_size
        entry = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            self.miss_ratio_stats.num_accesses,
            trace_record.access_time,
        )
        # Update the entry's cost class statistics.
        if self.cost_class_label is not None:
            cost_class = entry.cost_class(self.cost_class_label)
            if cost_class not in self.cost_classes:
                self.cost_classes[cost_class] = CostClassEntry()
            self.cost_classes[cost_class].insert(trace_record, key, value_size)
        self.table.insert(key, hash, entry)

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def _select_policy(self, trace_record, key):
        """Return the index into self.policies to use; subclasses override."""
        raise NotImplementedError
+
+
class ThompsonSamplingCache(MLCache):
    """
    An implementation of Thompson Sampling for the Bernoulli Bandit.

    Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband,
    and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found.
    Trends Mach. Learn. 11, 1 (July 2018), 1-96.
    DOI: https://doi.org/10.1561/2200000070
    """

    def __init__(
        self,
        cache_size,
        enable_cache_row_key,
        policies,
        cost_class_label,
        init_a=1,
        init_b=1,
    ):
        super(ThompsonSamplingCache, self).__init__(
            cache_size, enable_cache_row_key, policies, cost_class_label
        )
        # Per-policy Beta(a, b) parameters, one slot per policy. They are
        # built once instead of being rebuilt on every loop iteration.
        self._as = [init_a] * len(self.policies)
        self._bs = [init_b] * len(self.policies)

    def _select_policy(self, trace_record, key):
        """Sample each policy's Beta distribution and pick the best draw."""
        if len(self.policies) == 1:
            return 0
        samples = [
            np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies))
        ]
        selected_policy = max(range(len(self.policies)), key=lambda x: samples[x])
        reward = self.policies[selected_policy].generate_reward(key)
        assert reward <= 1 and reward >= 0
        # Update the selected policy's distribution with the observed reward.
        self._as[selected_policy] += reward
        self._bs[selected_policy] += 1 - reward
        return selected_policy

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format(
                self.cost_class_label
            )
        return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label)
+
+
class LinUCBCache(MLCache):
    """
    An implementation of LinUCB with disjoint linear models.

    Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010.
    A contextual-bandit approach to personalized news article recommendation.
    In Proceedings of the 19th international conference on World wide web
    (WWW '10). ACM, New York, NY, USA, 661-670.
    DOI=http://dx.doi.org/10.1145/1772690.1772758
    """

    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
        super(LinUCBCache, self).__init__(
            cache_size, enable_cache_row_key, policies, cost_class_label
        )
        # Block type, level, cf. Only the first three slots of the context
        # vector are filled in _select_policy; the fourth stays zero.
        self.nfeatures = 4
        self.th = np.zeros((len(self.policies), self.nfeatures))
        self.eps = 0.2
        self.b = np.zeros_like(self.th)
        self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
        self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
        identity = np.identity(self.nfeatures)
        for i in range(len(self.policies)):
            self.A[i] = identity
            # A_inv must start as the inverse of A (the identity). Leaving it
            # at zero gives policies that were never selected a zero
            # confidence bonus.
            self.A_inv[i] = identity
        self.th_hat = np.zeros_like(self.th)
        self.p = np.zeros(len(self.policies))
        self.alph = 0.2

    def _select_policy(self, trace_record, key):
        """Pick the policy with the highest upper confidence bound."""
        if len(self.policies) == 1:
            return 0
        x_i = np.zeros(self.nfeatures)  # The current context vector
        x_i[0] = trace_record.block_type
        x_i[1] = trace_record.level
        x_i[2] = trace_record.cf_id
        p = np.zeros(len(self.policies))
        for a in range(len(self.policies)):
            self.th_hat[a] = self.A_inv[a].dot(self.b[a])
            ta = x_i.dot(self.A_inv[a]).dot(x_i)
            a_upper_ci = self.alph * np.sqrt(ta)
            a_mean = self.th_hat[a].dot(x_i)
            p[a] = a_mean + a_upper_ci
        # Add a tiny random perturbation so that ties are broken randomly.
        p = p + (np.random.random(len(p)) * 0.000001)
        selected_policy = p.argmax()
        reward = self.policies[selected_policy].generate_reward(key)
        assert reward <= 1 and reward >= 0
        self.A[selected_policy] += np.outer(x_i, x_i)
        self.b[selected_policy] += reward * x_i
        self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy])
        return selected_policy

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format(
                self.cost_class_label
            )
        return "LinUCB with cost class {} (linucb)".format(self.cost_class_label)
+
+
class OPTCacheEntry:
    """
    A cache entry for the OPT algorithm. The entries are sorted based on its
    next access sequence number in reverse order, i.e., the entry which next
    access is the furthest in the future is ordered before other entries.
    Ties are broken by value size, smaller entries first.
    """

    def __init__(self, key, next_access_seq_no, value_size):
        self.key = key
        self.next_access_seq_no = next_access_seq_no
        self.value_size = value_size
        # Set by PQTable when the entry is superseded; the heap drops such
        # entries lazily.
        self.is_removed = False

    def __cmp__(self, other):
        if other.next_access_seq_no != self.next_access_seq_no:
            return other.next_access_seq_no - self.next_access_seq_no
        return self.value_size - other.value_size

    def __lt__(self, other):
        # Python 3 ignores __cmp__; heapq orders entries through __lt__.
        return self.__cmp__(other) < 0

    def __repr__(self):
        return "({} {} {} {})".format(
            self.key, self.next_access_seq_no, self.value_size, self.is_removed
        )
+
+
class PQTable:
    """
    A hash table with a priority queue.

    Entries must expose `key` and `is_removed` attributes and be orderable
    by heapq. Superseded entries stay in the heap flagged as removed and are
    discarded lazily by pqpop/pqpeek.
    """

    def __init__(self):
        # A list of entries arranged in a heap sorted based on the entry
        # custom comparison implementation.
        self.pq = []
        self.table = {}

    def pqinsert(self, entry):
        """
        Add a new key or update the priority of an existing key.

        Returns the entry previously stored under the same key, or None.
        """
        # Remove the entry from the table first.
        removed_entry = self.table.pop(entry.key, None)
        # Compare against None explicitly so that an entry type which
        # evaluates as falsy is still flagged as removed.
        if removed_entry is not None:
            # Mark as removed since there is no 'remove' API in heapq.
            # Instead, an entry in pq is removed lazily when calling pop.
            removed_entry.is_removed = True
        self.table[entry.key] = entry
        heapq.heappush(self.pq, entry)
        return removed_entry

    def pqpop(self):
        """Pop and return the first live entry, or None when there is none."""
        while self.pq:
            entry = heapq.heappop(self.pq)
            if not entry.is_removed:
                del self.table[entry.key]
                return entry
        return None

    def pqpeek(self):
        """Return the first live entry without removing it, or None."""
        while self.pq:
            entry = self.pq[0]
            if not entry.is_removed:
                return entry
            # Drop the stale entry sitting at the top of the heap.
            heapq.heappop(self.pq)
        return None

    def __contains__(self, k):
        return k in self.table

    def __getitem__(self, k):
        return self.table[k]

    def __len__(self):
        return len(self.table)

    def values(self):
        return self.table.values()
+
+
class OPTCache(Cache):
    """
    An implementation of the Belady MIN algorithm. OPTCache evicts an entry
    in the cache whose next access occurs furthest in the future.

    Note that Belady MIN algorithm is optimal assuming all blocks having the
    same size and a missing entry will be inserted in the cache.
    These are NOT true for the block cache trace since blocks have different
    sizes and we may not insert a block into the cache upon a cache miss.
    However, it is still useful to serve as a "theoretical upper bound" on the
    lowest miss ratio we can achieve given a cache size.

    L. A. Belady. 1966. A Study of Replacement Algorithms for a
    Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101.
    DOI=http://dx.doi.org/10.1147/sj.52.0078
    """

    def __init__(self, cache_size):
        super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0)
        self.table = PQTable()

    def _lookup(self, trace_record, key, hash):
        if key not in self.table:
            return False
        # A cache hit. Update its next access time.
        refreshed = OPTCacheEntry(
            key, trace_record.next_access_seq_no, self.table[key].value_size
        )
        # Keep the insert outside the assert so that it is not stripped when
        # assertions are disabled (python -O).
        previous = self.table.pqinsert(refreshed)
        assert previous is not None
        return True

    def _evict(self, trace_record, key, hash, value_size):
        while self.used_size + value_size > self.cache_size:
            evict_entry = self.table.pqpop()
            assert evict_entry is not None
            self.used_size -= evict_entry.value_size

    def _insert(self, trace_record, key, hash, value_size):
        # The key must not be cached yet. The insert itself is kept outside
        # the assert for the same reason as in _lookup.
        previous = self.table.pqinsert(
            OPTCacheEntry(key, trace_record.next_access_seq_no, value_size)
        )
        assert previous is None

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def cache_name(self):
        return "Belady MIN (opt)"
+
+
class GDSizeEntry:
    """
    A cache entry for the greedy dual size replacement policy.

    Entries are ordered by ascending priority; ties are broken by value
    size, smaller entries first.
    """

    def __init__(self, key, value_size, priority):
        self.key = key
        self.value_size = value_size
        self.priority = priority
        # Set by PQTable when the entry is superseded; the heap drops such
        # entries lazily.
        self.is_removed = False

    def __cmp__(self, other):
        if other.priority != self.priority:
            return self.priority - other.priority
        return self.value_size - other.value_size

    def __lt__(self, other):
        # Python 3 ignores __cmp__; heapq orders entries through __lt__.
        return self.__cmp__(other) < 0

    def __repr__(self):
        # This entry type has no next_access_seq_no; report its priority.
        return "({} {} {} {})".format(
            self.key, self.priority, self.value_size, self.is_removed
        )
+
+
class GDSizeCache(Cache):
    """
    An implementation of the greedy dual size algorithm.
    We define cost as an entry's size.

    See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html
    and N. Young. The k-server dual and loose competitiveness for paging.
    Algorithmica,June 1994, vol. 11,(no.6):525-41.
    Rewritten version of ''On-line caching as cache size varies'',
    in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = PQTable()
        # Priority of the most recently evicted entry; new and refreshed
        # entries get L plus their size as priority.
        self.L = 0.0

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid GreedyDualSize (gdsize_hybrid)"
        return "GreedyDualSize (gdsize)"

    def _lookup(self, trace_record, key, hash):
        if key not in self.table:
            return False
        # A cache hit. Update its priority.
        entry = self.table[key]
        # Keep the insert outside the assert so that it is not stripped when
        # assertions are disabled (python -O).
        previous = self.table.pqinsert(
            GDSizeEntry(key, entry.value_size, self.L + entry.value_size)
        )
        assert previous is not None
        return True

    def _evict(self, trace_record, key, hash, value_size):
        while self.used_size + value_size > self.cache_size:
            evict_entry = self.table.pqpop()
            assert evict_entry is not None
            self.L = evict_entry.priority
            self.used_size -= evict_entry.value_size

    def _insert(self, trace_record, key, hash, value_size):
        # The key must not be cached yet. The insert itself is kept outside
        # the assert for the same reason as in _lookup.
        previous = self.table.pqinsert(
            GDSizeEntry(key, value_size, self.L + value_size)
        )
        assert previous is None

    def _should_admit(self, trace_record, key, hash, value_size):
        return True
+
+
class Deque(object):
    """
    An ordered set of keys used as the queue for LRU and ARC.

    appendleft() places a key at the most-recent end and pop() removes the
    key at the opposite end. Iteration goes from most to least recent.
    """

    def __init__(self):
        self.od = OrderedDict()

    def appendleft(self, k):
        # Drop any existing occurrence so that the key moves to the
        # most-recent end.
        self.od.pop(k, None)
        self.od[k] = None

    def pop(self):
        if not self.od:
            return None
        oldest_key, _ = self.od.popitem(last=False)
        return oldest_key

    def remove(self, k):
        # Raises KeyError when the key is absent.
        self.od.pop(k)

    def __len__(self):
        return len(self.od)

    def __contains__(self, k):
        return k in self.od

    def __iter__(self):
        return reversed(self.od)

    def __repr__(self):
        return "Deque({!r})".format(list(self))
+
+
class ARCCache(Cache):
    """
    An implementation of ARC. ARC assumes that all blocks are having the
    same size. The size of index and filter blocks are variable. To accommodate
    this, we modified ARC as follows:
    1) We use 16 KB as the average block size and calculate the number of blocks
       (c) in the cache.
    2) When we insert an entry, the cache evicts entries in both t1 and t2
       queues until it has enough space for the new entry. This also requires
       modification of the algorithm to maintain a maximum of 2*c blocks.

    Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low
    Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on
    File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA,
    USA, 115-130.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(ARCCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = {}
        # Number of elements in the cache, assuming 16 KB blocks. The
        # parentheses matter: "cache_size / 16 * 1024" multiplies by 64
        # instead of dividing by the block size.
        self.c = cache_size / (16 * 1024)
        self.p = 0  # Target size for the list T1
        # L1: only once recently
        self.t1 = Deque()  # T1: recent cache entries
        self.b1 = Deque()  # B1: ghost entries recently evicted from the T1 cache
        # L2: at least twice recently
        self.t2 = Deque()  # T2: frequent entries
        self.b2 = Deque()  # B2: ghost entries recently evicted from the T2 cache

    def _replace(self, key, value_size):
        """Move entries from T1/T2 to their ghost lists until the value fits."""
        while self.used_size + value_size > self.cache_size:
            if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)):
                old = self.t1.pop()
                self.b1.appendleft(old)
            else:
                if self.t2:
                    old = self.t2.pop()
                    self.b2.appendleft(old)
                else:
                    old = self.t1.pop()
                    self.b1.appendleft(old)
            self.used_size -= self.table[old].value_size
            del self.table[old]

    def _lookup(self, trace_record, key, hash):
        # Case I: key is in T1 or T2.
        # Move key to MRU position in T2.
        if key in self.t1:
            self.t1.remove(key)
            self.t2.appendleft(key)
            return True

        if key in self.t2:
            self.t2.remove(key)
            self.t2.appendleft(key)
            return True
        return False

    def _evict(self, trace_record, key, hash, value_size):
        # Case II: key is in B1
        # Move x from B1 to the MRU position in T2 (also fetch x to the cache).
        if key in self.b1:
            self.p = min(self.c, self.p + max(len(self.b2) / len(self.b1), 1))
            self._replace(key, value_size)
            self.b1.remove(key)
            self.t2.appendleft(key)
            return

        # Case III: key is in B2
        # Move x from B2 to the MRU position in T2 (also fetch x to the cache).
        if key in self.b2:
            self.p = max(0, self.p - max(len(self.b1) / len(self.b2), 1))
            self._replace(key, value_size)
            self.b2.remove(key)
            self.t2.appendleft(key)
            return

        # Case IV: key is not in (T1 u B1 u T2 u B2)
        self._replace(key, value_size)
        # Trim the ghost lists so that L1 stays within c entries and the
        # whole directory within 2*c entries.
        while len(self.t1) + len(self.b1) >= self.c and self.b1:
            self.b1.pop()

        total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2)
        while total >= (2 * self.c) and self.b2:
            self.b2.pop()
            total -= 1
        # Finally, move it to MRU position in T1.
        self.t1.appendleft(key)
        return

    def _insert(self, trace_record, key, hash, value_size):
        self.table[key] = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            0,
            trace_record.access_time,
        )

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid Adaptive Replacement Cache (arc_hybrid)"
        return "Adaptive Replacement Cache (arc)"
+
+
class LRUCache(Cache):
    """
    A strict LRU queue: the least recently used key is evicted first.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(LRUCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = {}
        self.lru = Deque()

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid LRU (lru_hybrid)"
        return "LRU (lru)"

    def _lookup(self, trace_record, key, hash):
        if key in self.table:
            # A cache hit. Move the key to the most-recent end of the queue.
            self.lru.remove(key)
            self.lru.appendleft(key)
            return True
        return False

    def _evict(self, trace_record, key, hash, value_size):
        # Drop keys from the least-recent end until the new value fits.
        while self.used_size + value_size > self.cache_size:
            victim = self.lru.pop()
            self.used_size -= self.table[victim].value_size
            del self.table[victim]

    def _insert(self, trace_record, key, hash, value_size):
        new_entry = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            0,
            trace_record.access_time,
        )
        self.table[key] = new_entry
        self.lru.appendleft(key)

    def _should_admit(self, trace_record, key, hash, value_size):
        return True
+
+
class TraceCache(Cache):
    """
    A cache that replays the hits recorded in the trace. Lookup returns the
    trace record's own is_hit flag; nothing is ever stored or evicted.
    """

    def __init__(self, cache_size):
        super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0)

    def cache_name(self):
        return "Trace"

    def _lookup(self, trace_record, key, hash):
        # The hit/miss outcome comes from the trace itself.
        return trace_record.is_hit

    def _should_admit(self, trace_record, key, hash, value_size):
        # Never admit anything: this cache holds no entries.
        return False

    def _evict(self, trace_record, key, hash, value_size):
        return None

    def _insert(self, trace_record, key, hash, value_size):
        return None
+
+
def parse_cache_size(cs):
    """
    Convert a human-readable cache size into a number of bytes.

    Accepts a plain integer string or an integer followed by a binary unit
    suffix K, M, G or T (case-insensitive). Surrounding whitespace,
    including a trailing newline, is ignored.

    Raises ValueError when the numeric part is missing or not an integer.
    """
    unit_multipliers = {
        "K": 1024,
        "M": 1024 ** 2,
        "G": 1024 ** 3,
        "T": 1024 ** 4,
    }
    text = cs.strip()
    # Slicing instead of indexing keeps an empty string from raising
    # IndexError; int("") below reports it as a ValueError instead.
    multiplier = unit_multipliers.get(text[-1:].upper())
    if multiplier is None:
        return int(text)
    return int(text[:-1]) * multiplier
+
+
def create_cache(cache_type, cache_size, downsample_size):
    """
    Build the simulated cache named by `cache_type`.

    The cache size is scaled down by `downsample_size`. A cache type
    containing "hybridn" or "hybrid" selects row-key caching mode 2 or 1
    respectively, and the 8- or 7-character suffix is stripped before the
    base type is resolved.

    Raises ValueError for an unknown cache type, or when hybrid mode is
    requested for "opt" or "trace".
    """
    cache_size = cache_size / downsample_size
    enable_cache_row_key = 0
    if "hybridn" in cache_type:
        enable_cache_row_key = 2
        cache_type = cache_type[:-8]
    if "hybrid" in cache_type:
        enable_cache_row_key = 1
        cache_type = cache_type[:-7]

    # Bandit caches that choose among several replacement policies.
    if cache_type == "ts":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
            cost_class_label=None,
        )
    if cache_type == "linucb":
        return LinUCBCache(
            cache_size,
            enable_cache_row_key,
            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
            cost_class_label=None,
        )

    # Single-policy caches without a cost class.
    single_policy_types = {
        "pylru": LRUPolicy,
        "pymru": MRUPolicy,
        "pylfu": LFUPolicy,
        "pyhb": HyperbolicPolicy,
    }
    if cache_type in single_policy_types:
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [single_policy_types[cache_type]()],
            cost_class_label=None,
        )

    # Cost-class caches, keyed by the label that defines the class.
    cost_class_labels = {
        "pycctbbt": "table_bt",
        "pycccf": "cf",
        "pycctblevelbt": "table_level_bt",
        "pycccfbt": "cf_bt",
        "pycctb": "table",
        "pyccbt": "bt",
    }
    if cache_type in cost_class_labels:
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [CostClassPolicy()],
            cost_class_label=cost_class_labels[cache_type],
        )

    # Raise instead of "print + assert False": asserts are stripped under
    # python -O, which would silently build an unsupported configuration.
    if cache_type == "opt":
        if enable_cache_row_key:
            raise ValueError("opt does not support hybrid mode.")
        return OPTCache(cache_size)
    if cache_type == "trace":
        if enable_cache_row_key:
            raise ValueError("trace does not support hybrid mode.")
        return TraceCache(cache_size)
    if cache_type == "lru":
        return LRUCache(cache_size, enable_cache_row_key)
    if cache_type == "arc":
        return ARCCache(cache_size, enable_cache_row_key)
    if cache_type == "gdsize":
        return GDSizeCache(cache_size, enable_cache_row_key)
    raise ValueError("Unknown cache type {}".format(cache_type))
+
+
class BlockAccessTimeline:
    """
    BlockAccessTimeline stores all accesses of a block.

    `accesses` holds the access sequence numbers of the block in order.
    The cursor starts at index 1 because, while replaying the first access,
    the "next" access is the second element.
    """

    def __init__(self):
        self.accesses = []
        self.current_access_index = 1

    def get_next_access(self):
        """
        Return the sequence number of the block's next access and advance
        the cursor, or sys.maxsize when there is no further access.
        """
        # Use >= rather than == so that a cursor already past the end of the
        # list (e.g. an empty timeline) does not raise IndexError.
        if self.current_access_index >= len(self.accesses):
            return sys.maxsize
        next_access_seq_no = self.accesses[self.current_access_index]
        self.current_access_index += 1
        return next_access_seq_no
+
+
def percent(e1, e2):
    """Return e1 as a percentage of e2, or -1 when e2 is zero."""
    return -1 if e2 == 0 else float(e1) * 100.0 / float(e2)
+
+
def is_target_cf(access_cf, target_cf_name):
    """Return True when the target is "all" or names the access's column family."""
    return target_cf_name == "all" or access_cf == target_cf_name
+
+
def run(
    trace_file_path,
    cache_type,
    cache,
    warmup_seconds,
    max_accesses_to_process,
    target_cf_name,
):
    """
    Replay a block cache trace file against `cache`.

    Each trace line is a comma-separated record. Lines whose column family
    does not match `target_cf_name` are skipped. For the Belady MIN (opt)
    cache the trace is read twice: a first pass records every block's access
    sequence numbers so that the second pass can tell the cache when each
    block is accessed next.

    Once the trace has covered more than `warmup_seconds` (when positive),
    the cache's miss ratio counters are reset once. Passing -1 as
    `max_accesses_to_process` disables the access limit.

    Returns a (trace_start_time, trace_duration) tuple taken from the
    timestamps of the processed records.
    """
    warmup_complete = False
    trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute)
    access_seq_no = 0
    time_interval = 1
    start_time = time.time()
    trace_start_time = 0
    trace_duration = 0
    is_opt_cache = False
    if cache.cache_name() == "Belady MIN (opt)":
        is_opt_cache = True

    block_access_timelines = {}
    num_no_inserts = 0
    num_blocks_with_no_size = 0
    num_inserts_block_with_no_size = 0

    if is_opt_cache:
        # Read all blocks in memory and stores their access times so that OPT
        # can use this information to evict the cached key which next access is
        # the furthest in the future.
        print("Preprocessing block traces.")
        with open(trace_file_path, "r") as trace_file:
            for line in trace_file:
                # NOTE(review): with ">" one record beyond the limit is still
                # processed; kept as is because both passes must agree.
                if (
                    max_accesses_to_process != -1
                    and access_seq_no > max_accesses_to_process
                ):
                    break
                ts = line.split(",")
                timestamp = int(ts[0])
                cf_name = ts[5]
                if not is_target_cf(cf_name, target_cf_name):
                    continue
                if trace_start_time == 0:
                    trace_start_time = timestamp
                trace_duration = timestamp - trace_start_time
                block_id = int(ts[1])
                block_size = int(ts[3])
                no_insert = int(ts[9])
                if block_id not in block_access_timelines:
                    block_access_timelines[block_id] = BlockAccessTimeline()
                    if block_size == 0:
                        num_blocks_with_no_size += 1
                block_access_timelines[block_id].accesses.append(access_seq_no)
                access_seq_no += 1
                if no_insert == 1:
                    num_no_inserts += 1
                if no_insert == 0 and block_size == 0:
                    num_inserts_block_with_no_size += 1
                if access_seq_no % 100 != 0:
                    continue
                now = time.time()
                if now - start_time > time_interval * 10:
                    print(
                        "Take {} seconds to process {} trace records with trace "
                        "duration of {} seconds. Throughput: {} records/second.".format(
                            now - start_time,
                            access_seq_no,
                            trace_duration / 1000000,
                            access_seq_no / (now - start_time),
                        )
                    )
                    time_interval += 1
        print(
            "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size."
            "{3} accesses, {4}({5:.2f}%) accesses with no_insert,"
            "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format(
                len(block_access_timelines),
                num_blocks_with_no_size,
                percent(num_blocks_with_no_size, len(block_access_timelines)),
                access_seq_no,
                num_no_inserts,
                percent(num_no_inserts, access_seq_no),
                num_inserts_block_with_no_size,
                percent(num_inserts_block_with_no_size, access_seq_no),
            )
        )

    access_seq_no = 0
    time_interval = 1
    start_time = time.time()
    trace_start_time = 0
    trace_duration = 0
    print("Running simulated {} cache on block traces.".format(cache.cache_name()))
    with open(trace_file_path, "r") as trace_file:
        for line in trace_file:
            if (
                max_accesses_to_process != -1
                and access_seq_no > max_accesses_to_process
            ):
                break
            ts = line.split(",")
            timestamp = int(ts[0])
            cf_name = ts[5]
            if not is_target_cf(cf_name, target_cf_name):
                continue
            if access_seq_no % 1000000 == 0:
                # Force a python gc periodically to reduce memory usage.
                # This runs after the column family filter: skipped lines do
                # not advance access_seq_no, so checking before the filter
                # would collect on every skipped line.
                gc.collect()
            if trace_start_time == 0:
                trace_start_time = timestamp
            trace_duration = timestamp - trace_start_time
            if (
                not warmup_complete
                and warmup_seconds > 0
                and trace_duration > warmup_seconds * 1000000
            ):
                cache.miss_ratio_stats.reset_counter()
                warmup_complete = True
            next_access_seq_no = 0
            block_id = int(ts[1])
            if is_opt_cache:
                next_access_seq_no = block_access_timelines[block_id].get_next_access()
            record = TraceRecord(
                access_time=timestamp,
                block_id=block_id,
                block_type=int(ts[2]),
                block_size=int(ts[3]),
                cf_id=int(ts[4]),
                cf_name=cf_name,
                level=int(ts[6]),
                fd=int(ts[7]),
                caller=int(ts[8]),
                no_insert=int(ts[9]),
                get_id=int(ts[10]),
                key_id=int(ts[11]),
                kv_size=int(ts[12]),
                is_hit=int(ts[13]),
                referenced_key_exist_in_block=int(ts[14]),
                num_keys_in_block=int(ts[15]),
                table_id=int(ts[16]),
                seq_number=int(ts[17]),
                block_key_size=int(ts[18]),
                key_size=int(ts[19]),
                block_offset_in_file=int(ts[20]),
                next_access_seq_no=next_access_seq_no,
            )
            trace_miss_ratio_stats.update_metrics(
                record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size
            )
            cache.access(record)
            access_seq_no += 1
            if access_seq_no % 100 != 0:
                continue
            # Report progress every 10 seconds.
            now = time.time()
            if now - start_time > time_interval * 10:
                print(
                    "Take {} seconds to process {} trace records with trace "
                    "duration of {} seconds. Throughput: {} records/second. "
                    "Trace miss ratio {}".format(
                        now - start_time,
                        access_seq_no,
                        trace_duration / 1000000,
                        access_seq_no / (now - start_time),
                        trace_miss_ratio_stats.miss_ratio(),
                    )
                )
                time_interval += 1
                print(
                    "{},0,0,{},{},{}".format(
                        cache_type,
                        cache.cache_size,
                        cache.miss_ratio_stats.miss_ratio(),
                        cache.miss_ratio_stats.num_accesses,
                    )
                )
    now = time.time()
    elapsed = now - start_time
    # A tiny trace can finish within the clock resolution; do not divide by
    # a zero elapsed time.
    throughput = access_seq_no / elapsed if elapsed > 0 else 0.0
    print(
        "Take {} seconds to process {} trace records with trace duration of {} "
        "seconds. Throughput: {} records/second. Trace miss ratio {}".format(
            elapsed,
            access_seq_no,
            trace_duration / 1000000,
            throughput,
            trace_miss_ratio_stats.miss_ratio(),
        )
    )
    print(
        "{},0,0,{},{},{}".format(
            cache_type,
            cache.cache_size,
            cache.miss_ratio_stats.miss_ratio(),
            cache.miss_ratio_stats.num_accesses,
        )
    )
    return trace_start_time, trace_duration
+
+
def report_stats(
    cache,
    cache_type,
    cache_size,
    target_cf_name,
    result_dir,
    trace_start_time,
    trace_end_time,
):
    """
    Write the simulation results of `cache` into files under `result_dir`.

    Produces the overall miss ratio file, then for each of the cache's three
    miss ratio stats objects the average and p95 miss bytes files plus the
    miss and miss-ratio timelines. For ML caches the policy timelines are
    written as well.
    """
    cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name)
    mrc_path = "{}/data-ml-mrc-{}".format(result_dir, cache_label)
    with open(mrc_path, "w+") as mrc_file:
        mrc_file.write(
            "{},0,0,{},{},{}\n".format(
                cache_type,
                cache_size,
                cache.miss_ratio_stats.miss_ratio(),
                cache.miss_ratio_stats.num_accesses,
            )
        )

    # Arguments shared by every timeline writer below.
    timeline_args = (
        cache_type,
        cache_size,
        target_cf_name,
        result_dir,
        trace_start_time,
        trace_end_time,
    )

    for stats in (
        cache.per_second_miss_ratio_stats,
        cache.miss_ratio_stats,
        cache.per_hour_miss_ratio_stats,
    ):
        avg_miss_bytes, p95_miss_bytes = stats.compute_miss_bytes()

        avg_path = "{}/data-ml-avgmb-{}-{}".format(
            result_dir, stats.time_unit, cache_label
        )
        with open(avg_path, "w+") as mb_file:
            mb_file.write(
                "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes)
            )

        p95_path = "{}/data-ml-p95mb-{}-{}".format(
            result_dir, stats.time_unit, cache_label
        )
        with open(p95_path, "w+") as mb_file:
            mb_file.write(
                "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes)
            )

        stats.write_miss_timeline(*timeline_args)
        stats.write_miss_ratio_timeline(*timeline_args)

    if not cache.is_ml_cache():
        return

    for stats in (cache.policy_stats, cache.per_hour_policy_stats):
        stats.write_policy_timeline(*timeline_args)
        stats.write_policy_ratio_timeline(*timeline_args)
+
+
# Script entry point: parse the eight positional arguments, run the
# simulation, and write the result files.
if __name__ == "__main__":
    if len(sys.argv) <= 8:
        print(
            "Must provide 8 arguments.\n"
            "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, "
            "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache "
            "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. "
            "Note that hybrid is not supported with opt and trace. \n"
            "2) Cache size (xM, xG, xT).\n"
            "3) The sampling frequency used to collect the trace. (The "
            "simulation scales down the cache size by the sampling frequency).\n"
            "4) Warmup seconds (The number of seconds used for warmup).\n"
            "5) Trace file path.\n"
            "6) Result directory (A directory that saves generated results)\n"
            "7) Max number of accesses to process\n"
            "8) The target column family. (The simulation will only run "
            "accesses on the target column family. If it is set to all, "
            "it will run against all accesses.)"
        )
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing when Python runs with -S.
        sys.exit(1)
    print("Arguments: {}".format(sys.argv))
    cache_type = sys.argv[1]
    cache_size = parse_cache_size(sys.argv[2])
    downsample_size = int(sys.argv[3])
    warmup_seconds = int(sys.argv[4])
    trace_file_path = sys.argv[5]
    result_dir = sys.argv[6]
    max_accesses_to_process = int(sys.argv[7])
    target_cf_name = sys.argv[8]
    cache = create_cache(cache_type, cache_size, downsample_size)
    trace_start_time, trace_duration = run(
        trace_file_path,
        cache_type,
        cache,
        warmup_seconds,
        max_accesses_to_process,
        target_cf_name,
    )
    trace_end_time = trace_start_time + trace_duration
    report_stats(
        cache,
        cache_type,
        cache_size,
        target_cf_name,
        result_dir,
        trace_start_time,
        trace_end_time,
    )
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
new file mode 100644
index 000000000..295f734aa
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to run a batch of pysims and combine individual pysim output files.
+#
+# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs
+# trace_file_path: The file path that stores the traces.
+# result_dir: The directory to store pysim results. The output files from a pysim are stored in result_dir/ml
+# downsample_size: The downsample size used to collect the trace.
+# warmup_seconds: The number of seconds used for warmup.
+# max_jobs: The max number of concurrent pysims to run.
+
+# Install required packages to run simulations.
+# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel
+ulimit -c 0
+
+if [ $# -ne 5 ]; then
+  echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs"
+  # A usage error is a failure: report it with a non-zero exit status.
+  exit 1
+fi
+
+trace_file="$1"
+result_dir="$2"
+downsample_size="$3"
+warmup_seconds="$4"
+max_jobs="$5"
+max_num_accesses=100000000
+
+# Print the number of running processes whose ps line mentions both "pysim"
+# and "python" (the grep processes themselves are excluded).
+count_running_jobs() {
+  ps aux | grep pysim | grep python | grep -cv grep
+}
+
+ml_tmp_result_dir="$result_dir/ml"
+rm -rf "$ml_tmp_result_dir"
+mkdir -p "$result_dir"
+mkdir -p "$ml_tmp_result_dir"
+
+# Report miss ratio in the trace.
+current_jobs=$(count_running_jobs)
+for cf_name in "all"
+do
+  for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T"
+  do
+    for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid" #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace"
+    do
+      if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then
+        # We only need to collect miss ratios observed in the trace once.
+        continue
+      fi
+      # Throttle: do not launch another simulation until a slot is free.
+      while [ "$current_jobs" -ge "$max_jobs" ]
+      do
+        sleep 10
+        current_jobs=$(count_running_jobs)
+        echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+      done
+      output="log-ml-$cache_type-$cache_size-$cf_name"
+      echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs. "
+      nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" &
+      current_jobs=$((current_jobs+1))
+    done
+  done
+done
+
+# Wait for all jobs to complete.
+while [ "$current_jobs" -gt 0 ]
+do
+  sleep 10
+  current_jobs=$(count_running_jobs)
+  echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+done
+
+echo "Combine individual pysim output files"
+
+# The glob has to stay outside the quotes; a quoted "ml_*" is taken literally,
+# so summary files from a previous run would never be removed and new data
+# would be appended to them. ${result_dir:?} aborts if the variable is empty.
+rm -rf "${result_dir:?}"/ml_*
+# Two passes: the first copies one header per summary file, the second appends
+# the data of every individual output file.
+for header in "header-" "data-"
+do
+  for fn in "$ml_tmp_result_dir"/*
+  do
+    # Parse the file name only, so that a '-' inside the directory path cannot
+    # shift the positions of the '-'-separated fields.
+    base="$(basename "$fn")"
+    sum_file=""
+    time_unit=""
+    capacity=""
+    target_cf_name=""
+    if [[ $base == *"timeline"* ]]; then
+      IFS='-' read -ra elements <<< "$base"
+      # Locate the "timeline" field; the fields that follow it are addressed
+      # relative to its position.
+      time_unit_index=0
+      for i in "${elements[@]}"
+      do
+        if [[ $i == "timeline" ]]; then
+          break
+        fi
+        time_unit_index=$((time_unit_index+1))
+      done
+      time_unit_index=$((time_unit_index+1))
+      capacity_index=$((time_unit_index+2))
+      target_cf_name_index=$((time_unit_index+3))
+      time_unit="${elements[$time_unit_index]}_"
+      capacity="${elements[$capacity_index]}_"
+      target_cf_name="${elements[$target_cf_name_index]}_"
+    fi
+
+    if [[ $base == *"${header}ml-policy-timeline"* ]]; then
+      sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline"
+    fi
+    if [[ $base == *"${header}ml-policy-ratio-timeline"* ]]; then
+      sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline"
+    fi
+    if [[ $base == *"${header}ml-miss-timeline"* ]]; then
+      sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline"
+    fi
+    if [[ $base == *"${header}ml-miss-ratio-timeline"* ]]; then
+      sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline"
+    fi
+    if [[ $base == *"${header}ml-mrc"* ]]; then
+      IFS='-' read -ra elements <<< "$base"
+      target_cf_name=${elements[-1]}
+      sum_file="${result_dir}/ml_${target_cf_name}_mrc"
+    fi
+    if [[ $base == *"${header}ml-avgmb"* ]]; then
+      IFS='-' read -ra elements <<< "$base"
+      time_unit=${elements[3]}
+      target_cf_name=${elements[-1]}
+      sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb"
+    fi
+    if [[ $base == *"${header}ml-p95mb"* ]]; then
+      IFS='-' read -ra elements <<< "$base"
+      time_unit=${elements[3]}
+      target_cf_name=${elements[-1]}
+      sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb"
+    fi
+    if [[ $sum_file == "" ]]; then
+      continue
+    fi
+    if [[ $header == "header-" ]]; then
+      # Only the first header seen for a summary file is kept.
+      if [ -e "$sum_file" ]; then
+        continue
+      fi
+    fi
+    cat "$fn" >> "$sum_file"
+  done
+done
+
+for fn in "$result_dir"/*
+do
+  if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then
+    # Sort MRC file by cache_type and cache_size. `sort -o` may name one of
+    # its own inputs, which avoids a temporary file inside result_dir.
+    sort -t ',' -k1,1 -k4,4n -o "$fn" "$fn"
+  fi
+done
+echo "Done"
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py
new file mode 100644
index 000000000..4b2bdeba6
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py
@@ -0,0 +1,734 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+import random
+import sys
+
+from block_cache_pysim import (
+ ARCCache,
+ CacheEntry,
+ GDSizeCache,
+ HashTable,
+ HyperbolicPolicy,
+ LFUPolicy,
+ LinUCBCache,
+ LRUCache,
+ LRUPolicy,
+ MRUPolicy,
+ OPTCache,
+ OPTCacheEntry,
+ ThompsonSamplingCache,
+ TraceCache,
+ TraceRecord,
+ create_cache,
+ kMicrosInSecond,
+ kSampleSize,
+ run,
+)
+
+
+def test_hash_table():
+ """Exercise HashTable insert/lookup/delete and random_sample.
+
+ The first part uses a fixed set of keys; the second part applies random
+ operations and compares the table against a plain dict (truth_map).
+ """
+ print("Test hash table")
+ table = HashTable()
+ data_size = 10000
+ # Insert data_size keys, check each one is found, delete them all and
+ # check that none can be found afterwards.
+ for i in range(data_size):
+ table.insert("k{}".format(i), i, "v{}".format(i))
+ for i in range(data_size):
+ assert table.lookup("k{}".format(i), i) is not None
+ for i in range(data_size):
+ table.delete("k{}".format(i), i)
+ for i in range(data_size):
+ assert table.lookup("k{}".format(i), i) is None
+
+ # Randomized part: truth_map mirrors the expected content of the table.
+ truth_map = {}
+ n = 1000000
+ records = 100
+ for i in range(n):
+ key_id = random.randint(0, records)
+ v = random.randint(0, records)
+ key = "k{}".format(key_id)
+ value = CacheEntry(v, v, v, v, v, v, v)
+ # randint is inclusive on both ends: 0..8 selects lookup/insert,
+ # 9 and 10 select delete.
+ action = random.randint(0, 10)
+ assert len(truth_map) == table.elements, "{} {} {}".format(
+ len(truth_map), table.elements, i
+ )
+ if action <= 8:
+ if key in truth_map:
+ assert table.lookup(key, key_id) is not None
+ assert truth_map[key].value_size == table.lookup(key, key_id).value_size
+ else:
+ assert table.lookup(key, key_id) is None
+ table.insert(key, key_id, value)
+ truth_map[key] = value
+ else:
+ deleted = table.delete(key, key_id)
+ if deleted:
+ assert key in truth_map
+ if key in truth_map:
+ del truth_map[key]
+
+ # Check all keys are unique in the sample set.
+ for _i in range(10):
+ samples = table.random_sample(kSampleSize)
+ unique_keys = {}
+ for sample in samples:
+ unique_keys[sample.key] = True
+ assert len(samples) == len(unique_keys)
+
+ # Final cross-check. Keys have the form "k<key_id>", so the hash argument
+ # is recovered from the key text with int(key[1:]).
+ assert len(table) == len(truth_map)
+ for key in truth_map:
+ assert table.lookup(key, int(key[1:])) is not None
+ assert truth_map[key].value_size == table.lookup(key, int(key[1:])).value_size
+ print("Test hash table: Success")
+
+
+def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtable=True):
+ assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format(
+ expected_value[0], cache.used_size
+ )
+ assert (
+ cache.miss_ratio_stats.num_accesses == expected_value[1]
+ ), "Expected {}, Actual {}".format(
+ expected_value[1], cache.miss_ratio_stats.num_accesses
+ )
+ assert (
+ cache.miss_ratio_stats.num_misses == expected_value[2]
+ ), "Expected {}, Actual {}".format(
+ expected_value[2], cache.miss_ratio_stats.num_misses
+ )
+ assert len(cache.table) == len(expected_value[3]) + len(
+ expected_value[4]
+ ), "Expected {}, Actual {}".format(
+ len(expected_value[3]) + len(expected_value[4]), cache.table.elements
+ )
+ for expeceted_k in expected_value[3]:
+ if custom_hashtable:
+ val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k)
+ else:
+ val = cache.table["b{}".format(expeceted_k)]
+ assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format(
+ expeceted_k, expected_value, cache.table
+ )
+ assert val.value_size == expected_value_size
+ for expeceted_k in expected_value[4]:
+ if custom_hashtable:
+ val = cache.table.lookup("g0-{}".format(expeceted_k), expeceted_k)
+ else:
+ val = cache.table["g0-{}".format(expeceted_k)]
+ assert val is not None
+ assert val.value_size == expected_value_size
+
+
+# Access k1, k1, k2, k3, k3, k3, k4
+# When k4 is inserted,
+# LRU should evict k1.
+# LFU should evict k2.
+# MRU should evict k3.
+def test_cache(cache, expected_value, custom_hashtable=True):
+ k1 = TraceRecord(
+ access_time=0,
+ block_id=1,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1,
+ key_id=1,
+ kv_size=5,
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=0,
+ )
+ k2 = TraceRecord(
+ access_time=1,
+ block_id=2,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1,
+ key_id=1,
+ kv_size=5,
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=0,
+ )
+ k3 = TraceRecord(
+ access_time=2,
+ block_id=3,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1,
+ key_id=1,
+ kv_size=5,
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=0,
+ )
+ k4 = TraceRecord(
+ access_time=3,
+ block_id=4,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1,
+ key_id=1,
+ kv_size=5,
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=0,
+ )
+ sequence = [k1, k1, k2, k3, k3, k3]
+ index = 0
+ expected_values = []
+ # Access k1, miss.
+ expected_values.append([1, 1, 1, [1], []])
+ # Access k1, hit.
+ expected_values.append([1, 2, 1, [1], []])
+ # Access k2, miss.
+ expected_values.append([2, 3, 2, [1, 2], []])
+ # Access k3, miss.
+ expected_values.append([3, 4, 3, [1, 2, 3], []])
+ # Access k3, hit.
+ expected_values.append([3, 5, 3, [1, 2, 3], []])
+ # Access k3, hit.
+ expected_values.append([3, 6, 3, [1, 2, 3], []])
+ access_time = 0
+ for access in sequence:
+ access.access_time = access_time
+ cache.access(access)
+ assert_metrics(
+ cache,
+ expected_values[index],
+ expected_value_size=1,
+ custom_hashtable=custom_hashtable,
+ )
+ access_time += 1
+ index += 1
+ k4.access_time = access_time
+ cache.access(k4)
+ assert_metrics(
+ cache, expected_value, expected_value_size=1, custom_hashtable=custom_hashtable
+ )
+
+
+def test_lru_cache(cache, custom_hashtable):
+ print("Test LRU cache")
+ # Access k4, miss. evict k1
+ test_cache(cache, [3, 7, 4, [2, 3, 4], []], custom_hashtable)
+ print("Test LRU cache: Success")
+
+
+def test_mru_cache():
+ print("Test MRU cache")
+ policies = []
+ policies.append(MRUPolicy())
+ # Access k4, miss. evict k3
+ test_cache(
+ ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+ [3, 7, 4, [1, 2, 4], []],
+ )
+ print("Test MRU cache: Success")
+
+
+def test_lfu_cache():
+ print("Test LFU cache")
+ policies = []
+ policies.append(LFUPolicy())
+ # Access k4, miss. evict k2
+ test_cache(
+ ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+ [3, 7, 4, [1, 3, 4], []],
+ )
+ print("Test LFU cache: Success")
+
+
+def test_mix(cache):
+ print("Test Mix {} cache".format(cache.cache_name()))
+ n = 100000
+ records = 100
+ block_size_table = {}
+ trace_num_misses = 0
+ for i in range(n):
+ key_id = random.randint(0, records)
+ vs = random.randint(0, 10)
+ now = i * kMicrosInSecond
+ block_size = vs
+ if key_id in block_size_table:
+ block_size = block_size_table[key_id]
+ else:
+ block_size_table[key_id] = block_size
+ is_hit = key_id % 2
+ if is_hit == 0:
+ trace_num_misses += 1
+ k = TraceRecord(
+ access_time=now,
+ block_id=key_id,
+ block_type=1,
+ block_size=block_size,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=key_id,
+ key_id=key_id,
+ kv_size=5,
+ is_hit=is_hit,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=vs,
+ )
+ cache.access(k)
+ assert cache.miss_ratio_stats.miss_ratio() > 0
+ if cache.cache_name() == "Trace":
+ assert cache.miss_ratio_stats.num_accesses == n
+ assert cache.miss_ratio_stats.num_misses == trace_num_misses
+ else:
+ assert cache.used_size <= cache.cache_size
+ all_values = cache.table.values()
+ cached_size = 0
+ for value in all_values:
+ cached_size += value.value_size
+ assert cached_size == cache.used_size, "Expeced {} Actual {}".format(
+ cache.used_size, cached_size
+ )
+ print("Test Mix {} cache: Success".format(cache.cache_name()))
+
+
+def test_end_to_end():
+ print("Test All caches")
+ n = 100000
+ nblocks = 1000
+ block_size = 16 * 1024
+ ncfs = 7
+ nlevels = 6
+ nfds = 100000
+ trace_file_path = "test_trace"
+ # All blocks are of the same size so that OPT must achieve the lowest miss
+ # ratio.
+ with open(trace_file_path, "w+") as trace_file:
+ access_records = ""
+ for i in range(n):
+ key_id = random.randint(0, nblocks)
+ cf_id = random.randint(0, ncfs)
+ level = random.randint(0, nlevels)
+ fd = random.randint(0, nfds)
+ now = i * kMicrosInSecond
+ access_record = ""
+ access_record += "{},".format(now)
+ access_record += "{},".format(key_id)
+ access_record += "{},".format(9) # block type
+ access_record += "{},".format(block_size) # block size
+ access_record += "{},".format(cf_id)
+ access_record += "cf_{},".format(cf_id)
+ access_record += "{},".format(level)
+ access_record += "{},".format(fd)
+ access_record += "{},".format(key_id % 3) # caller
+ access_record += "{},".format(0) # no insert
+ access_record += "{},".format(i) # get_id
+ access_record += "{},".format(i) # key_id
+ access_record += "{},".format(100) # kv_size
+ access_record += "{},".format(1) # is_hit
+ access_record += "{},".format(1) # referenced_key_exist_in_block
+ access_record += "{},".format(10) # num_keys_in_block
+ access_record += "{},".format(1) # table_id
+ access_record += "{},".format(0) # seq_number
+ access_record += "{},".format(10) # block key size
+ access_record += "{},".format(20) # key size
+ access_record += "{},".format(0) # block offset
+ access_record = access_record[:-1]
+ access_records += access_record + "\n"
+ trace_file.write(access_records)
+
+ print("Test All caches: Start testing caches")
+ cache_size = block_size * nblocks / 10
+ downsample_size = 1
+ cache_ms = {}
+ for cache_type in [
+ "ts",
+ "opt",
+ "lru",
+ "pylru",
+ "linucb",
+ "gdsize",
+ "pyccbt",
+ "pycctbbt",
+ ]:
+ cache = create_cache(cache_type, cache_size, downsample_size)
+ run(trace_file_path, cache_type, cache, 0, -1, "all")
+ cache_ms[cache_type] = cache
+ assert cache.miss_ratio_stats.num_accesses == n
+
+ for cache_type in cache_ms:
+ cache = cache_ms[cache_type]
+ ms = cache.miss_ratio_stats.miss_ratio()
+ assert ms <= 100.0 and ms >= 0.0
+ # OPT should perform the best.
+ assert cache_ms["opt"].miss_ratio_stats.miss_ratio() <= ms
+ assert cache.used_size <= cache.cache_size
+ all_values = cache.table.values()
+ cached_size = 0
+ for value in all_values:
+ cached_size += value.value_size
+ assert cached_size == cache.used_size, "Expeced {} Actual {}".format(
+ cache.used_size, cached_size
+ )
+ print("Test All {}: Success".format(cache.cache_name()))
+
+ os.remove(trace_file_path)
+ print("Test All: Success")
+
+
+def test_hybrid(cache):
+ """Drive a cache through a scripted series of get requests.
+
+ A single TraceRecord is mutated between accesses, and assert_metrics is
+ called after each step to check the used size, the access/miss counters
+ and which blocks and row keys are cached. The caller in this file passes
+ caches built with enable_cache_row_key=1 and a capacity of kSampleSize.
+ """
+ print("Test {} cache".format(cache.cache_name()))
+ k = TraceRecord(
+ access_time=0,
+ block_id=1,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1, # the first get request.
+ key_id=1,
+ kv_size=0, # no size.
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=0,
+ )
+ cache.access(k) # Expect a miss.
+ # Expected list layout: used size, num accesses, num misses, cached block
+ # ids, cached row (get) key ids.
+ assert_metrics(cache, [1, 1, 1, [1], []])
+ k.access_time += 1
+ k.kv_size = 1
+ k.block_id = 2
+ cache.access(k) # k should be inserted.
+ assert_metrics(cache, [3, 2, 2, [1, 2], [1]])
+ k.access_time += 1
+ k.block_id = 3
+ cache.access(k) # k should not be inserted again.
+ assert_metrics(cache, [4, 3, 3, [1, 2, 3], [1]])
+ # A second get request referencing the same key.
+ k.access_time += 1
+ k.get_id = 2
+ k.block_id = 4
+ k.kv_size = 0
+ cache.access(k) # k should observe a hit. No block access.
+ assert_metrics(cache, [4, 4, 3, [1, 2, 3], [1]])
+
+ # A third get request searches three files, three different keys.
+ # And the second key observes a hit.
+ k.access_time += 1
+ k.kv_size = 1
+ k.get_id = 3
+ k.block_id = 3
+ k.key_id = 2
+ cache.access(k) # k should observe a miss. block 3 observes a hit.
+ assert_metrics(cache, [5, 5, 3, [1, 2, 3], [1, 2]])
+
+ k.access_time += 1
+ k.kv_size = 1
+ k.get_id = 3
+ k.block_id = 4
+ k.kv_size = 1
+ k.key_id = 1
+ cache.access(k) # k1 should observe a hit.
+ assert_metrics(cache, [5, 6, 3, [1, 2, 3], [1, 2]])
+
+ k.access_time += 1
+ k.kv_size = 1
+ k.get_id = 3
+ k.block_id = 4
+ k.kv_size = 1
+ k.key_id = 3
+ # k3 should observe a miss.
+ # However, as the get already complete, we should not access k3 any more.
+ cache.access(k)
+ assert_metrics(cache, [5, 7, 3, [1, 2, 3], [1, 2]])
+
+ # A fourth get request searches one file and two blocks. One row key.
+ k.access_time += 1
+ k.get_id = 4
+ k.block_id = 5
+ k.key_id = 4
+ k.kv_size = 1
+ cache.access(k)
+ assert_metrics(cache, [7, 8, 4, [1, 2, 3, 5], [1, 2, 4]])
+
+ # A bunch of insertions which evict cached row keys.
+ for i in range(6, 100):
+ k.access_time += 1
+ k.get_id = 0
+ k.block_id = i
+ cache.access(k)
+
+ k.get_id = 4
+ k.block_id = 100 # A different block.
+ k.key_id = 4 # Same row key and should not be inserted again.
+ k.kv_size = 1
+ cache.access(k)
+ # Only the last kSampleSize block ids (up to 100) are expected to remain,
+ # with no row keys left in the table.
+ assert_metrics(
+ cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]
+ )
+ print("Test {} cache: Success".format(cache.cache_name()))
+
+
+def test_opt_cache():
+ print("Test OPT cache")
+ cache = OPTCache(3)
+ # seq: 0, 1, 2, 3, 4, 5, 6, 7, 8
+ # key: k1, k2, k3, k4, k5, k6, k7, k1, k8
+ # next_access: 7, 19, 18, M, M, 17, 16, 25, M
+ k = TraceRecord(
+ access_time=0,
+ block_id=1,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1, # the first get request.
+ key_id=1,
+ kv_size=0, # no size.
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=7,
+ )
+ cache.access(k)
+ assert_metrics(
+ cache, [1, 1, 1, [1], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 2
+ k.next_access_seq_no = 19
+ cache.access(k)
+ assert_metrics(
+ cache, [2, 2, 2, [1, 2], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 3
+ k.next_access_seq_no = 18
+ cache.access(k)
+ assert_metrics(
+ cache, [3, 3, 3, [1, 2, 3], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 4
+ k.next_access_seq_no = sys.maxsize # Never accessed again.
+ cache.access(k)
+ # Evict 2 since its next access 19 is the furthest in the future.
+ assert_metrics(
+ cache, [3, 4, 4, [1, 3, 4], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 5
+ k.next_access_seq_no = sys.maxsize # Never accessed again.
+ cache.access(k)
+ # Evict 4 since its next access MAXINT is the furthest in the future.
+ assert_metrics(
+ cache, [3, 5, 5, [1, 3, 5], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 6
+ k.next_access_seq_no = 17
+ cache.access(k)
+ # Evict 5 since its next access MAXINT is the furthest in the future.
+ assert_metrics(
+ cache, [3, 6, 6, [1, 3, 6], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 7
+ k.next_access_seq_no = 16
+ cache.access(k)
+ # Evict 3 since its next access 18 is the furthest in the future.
+ assert_metrics(
+ cache, [3, 7, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 1
+ k.next_access_seq_no = 25
+ cache.access(k)
+ assert_metrics(
+ cache, [3, 8, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False
+ )
+ k.access_time += 1
+ k.block_id = 8
+ k.next_access_seq_no = sys.maxsize
+ cache.access(k)
+ # Evict 1 since its next access 25 is the furthest in the future.
+ assert_metrics(
+ cache, [3, 9, 8, [6, 7, 8], []], expected_value_size=1, custom_hashtable=False
+ )
+
+ # Insert a large kv pair to evict all keys.
+ k.access_time += 1
+ k.block_id = 10
+ k.block_size = 3
+ k.next_access_seq_no = sys.maxsize
+ cache.access(k)
+ assert_metrics(
+ cache, [3, 10, 9, [10], []], expected_value_size=3, custom_hashtable=False
+ )
+ print("Test OPT cache: Success")
+
+
+def test_trace_cache():
+ print("Test trace cache")
+ cache = TraceCache(0)
+ k = TraceRecord(
+ access_time=0,
+ block_id=1,
+ block_type=1,
+ block_size=1,
+ cf_id=0,
+ cf_name="",
+ level=0,
+ fd=0,
+ caller=1,
+ no_insert=0,
+ get_id=1,
+ key_id=1,
+ kv_size=0,
+ is_hit=1,
+ referenced_key_exist_in_block=1,
+ num_keys_in_block=0,
+ table_id=0,
+ seq_number=0,
+ block_key_size=0,
+ key_size=0,
+ block_offset_in_file=0,
+ next_access_seq_no=7,
+ )
+ cache.access(k)
+ assert cache.miss_ratio_stats.num_accesses == 1
+ assert cache.miss_ratio_stats.num_misses == 0
+ k.is_hit = 0
+ cache.access(k)
+ assert cache.miss_ratio_stats.num_accesses == 2
+ assert cache.miss_ratio_stats.num_misses == 1
+ print("Test trace cache: Success")
+
+
+if __name__ == "__main__":
+ test_hash_table()
+ test_trace_cache()
+ test_opt_cache()
+ test_lru_cache(
+ ThompsonSamplingCache(
+ 3, enable_cache_row_key=0, policies=[LRUPolicy()], cost_class_label=None
+ ),
+ custom_hashtable=True,
+ )
+ test_lru_cache(LRUCache(3, enable_cache_row_key=0), custom_hashtable=False)
+ test_mru_cache()
+ test_lfu_cache()
+ test_hybrid(
+ ThompsonSamplingCache(
+ kSampleSize,
+ enable_cache_row_key=1,
+ policies=[LRUPolicy()],
+ cost_class_label=None,
+ )
+ )
+ test_hybrid(
+ LinUCBCache(
+ kSampleSize,
+ enable_cache_row_key=1,
+ policies=[LRUPolicy()],
+ cost_class_label=None,
+ )
+ )
+ for cache_type in [
+ "ts",
+ "opt",
+ "arc",
+ "pylfu",
+ "pymru",
+ "trace",
+ "pyhb",
+ "lru",
+ "pylru",
+ "linucb",
+ "gdsize",
+ "pycctbbt",
+ "pycctb",
+ "pyccbt",
+ ]:
+ for enable_row_cache in [0, 1, 2]:
+ cache_type_str = cache_type
+ if cache_type != "opt" and cache_type != "trace":
+ if enable_row_cache == 1:
+ cache_type_str += "_hybrid"
+ elif enable_row_cache == 2:
+ cache_type_str += "_hybridn"
+ test_mix(create_cache(cache_type_str, cache_size=100, downsample_size=1))
+ test_end_to_end()
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc
new file mode 100644
index 000000000..f90cb794b
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc
@@ -0,0 +1,2308 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <sstream>
+
+#include "monitoring/histogram.h"
+#include "util/gflags_compat.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+// Command-line flags of the block cache trace analyzer. The help strings are
+// what users see in --help output, so they must name the right flags.
+DEFINE_string(block_cache_trace_path, "", "The trace file path.");
+DEFINE_bool(is_block_cache_human_readable_trace, false,
+            "Is the trace file provided for analysis generated by running "
+            "block_cache_trace_analyzer with "
+            "FLAGS_human_readable_trace_file_path specified.");
+DEFINE_string(
+    block_cache_sim_config_path, "",
+    "The config file path. One cache configuration per line. The format of a "
+    "cache configuration is "
+    "cache_name,num_shard_bits,ghost_capacity,cache_capacity_1,...,cache_"
+    "capacity_N. Supported cache names are lru, lru_priority, lru_hybrid, and "
+    "lru_hybrid_no_insert_on_row_miss. User may also add a prefix 'ghost_' to "
+    "a cache_name to add a ghost cache in front of the real cache. "
+    "ghost_capacity and cache_capacity can be xK, xM or xG where x is a "
+    "positive number.");
+DEFINE_int32(block_cache_trace_downsample_ratio, 1,
+             "The trace collected accesses on one in every "
+             "block_cache_trace_downsample_ratio blocks. We scale "
+             "down the simulated cache size by this ratio.");
+DEFINE_bool(print_block_size_stats, false,
+            "Print block size distribution and the distribution break down by "
+            "block type and column family.");
+DEFINE_bool(print_access_count_stats, false,
+            "Print access count distribution and the distribution break down "
+            "by block type and column family.");
+DEFINE_bool(print_data_block_access_count_stats, false,
+            "Print data block accesses by user Get and Multi-Get.");
+DEFINE_int32(cache_sim_warmup_seconds, 0,
+             "The number of seconds to warmup simulated caches. The hit/miss "
+             "counters are reset after the warmup completes.");
+DEFINE_int32(analyze_bottom_k_access_count_blocks, 0,
+             "Print out detailed access information for blocks with their "
+             "number of accesses are the bottom k among all blocks.");
+DEFINE_int32(analyze_top_k_access_count_blocks, 0,
+             "Print out detailed access information for blocks with their "
+             "number of accesses are the top k among all blocks.");
+DEFINE_string(block_cache_analysis_result_dir, "",
+              "The directory that saves block cache analysis results.");
+DEFINE_string(
+    timeline_labels, "",
+    "Group the number of accesses per block per second using these labels. "
+    "Possible labels are a combination of the following: cf (column family), "
+    "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" "
+    "means the number of accesses per second is grouped by unique pairs of "
+    "\"cf_bt\". A label \"all\" contains the aggregated number of accesses per "
+    "second across all possible labels.");
+DEFINE_string(reuse_distance_labels, "",
+              "Group the reuse distance of a block using these labels. Reuse "
+              "distance is defined as the cumulated size of unique blocks read "
+              "between two consecutive accesses on the same block.");
+DEFINE_string(
+    reuse_distance_buckets, "",
+    "Group blocks by their reuse distances given these buckets. For "
+    "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M "
+    "and 1G, respectively. The last bucket contains the number of blocks with "
+    "reuse distance larger than 1G. ");
+DEFINE_string(
+    reuse_interval_labels, "",
+    "Group the reuse interval of a block using these labels. Reuse "
+    "interval is defined as the time between two consecutive accesses "
+    "on the same block.");
+DEFINE_string(
+    reuse_interval_buckets, "",
+    "Group blocks by their reuse interval given these buckets. For "
+    "example, if 'reuse_interval_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse interval less than 1 second, between 1 second and 10 "
+    "seconds, between 10 seconds and 100 seconds, respectively. The last "
+    "bucket contains the number of blocks with reuse interval longer than 100 "
+    "seconds.");
+DEFINE_string(
+    reuse_lifetime_labels, "",
+    "Group the reuse lifetime of a block using these labels. Reuse "
+    "lifetime is defined as the time interval between the first access on a "
+    "block and the last access on the same block. For blocks that are only "
+    "accessed once, its lifetime is set to kMaxUint64.");
+DEFINE_string(
+    reuse_lifetime_buckets, "",
+    "Group blocks by their reuse lifetime given these buckets. For "
+    "example, if 'reuse_lifetime_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse lifetime less than 1 second, between 1 second and 10 "
+    "seconds, between 10 seconds and 100 seconds, respectively. The last "
+    "bucket contains the number of blocks with reuse lifetime longer than 100 "
+    "seconds.");
+DEFINE_string(
+    analyze_callers, "",
+    "The list of callers to perform a detailed analysis on. If specified, the "
+    "analyzer will output a detailed percentage of accesses for each caller "
+    "break down by column family, level, and block type. A list of available "
+    "callers are: Get, MultiGet, Iterator, ApproximateSize, VerifyChecksum, "
+    "SSTDumpTool, ExternalSSTIngestion, Repair, Prefetch, Compaction, "
+    "CompactionRefill, Flush, SSTFileReader, Uncategorized.");
+DEFINE_string(access_count_buckets, "",
+              "Group number of blocks by their access count given these "
+              "buckets. If specified, the analyzer will output a detailed "
+              "analysis on the number of blocks grouped by their access count "
+              "break down by block type and column family.");
+DEFINE_int32(analyze_blocks_reuse_k_reuse_window, 0,
+             "Analyze the percentage of blocks that are accessed in the "
+             "[k, 2*k] seconds are accessed again in the next [2*k, 3*k], "
+             "[3*k, 4*k],...,[k*(n-1), k*n] seconds. ");
+DEFINE_string(analyze_get_spatial_locality_labels, "",
+              "Group data blocks using these labels.");
+DEFINE_string(analyze_get_spatial_locality_buckets, "",
+              "Group data blocks by their statistics using these buckets.");
+DEFINE_string(skew_labels, "",
+              "Group the access count of a block using these labels.");
+DEFINE_string(skew_buckets, "", "Group the skew labels using these buckets.");
+DEFINE_bool(mrc_only, false,
+            "Evaluate alternative cache policies only. When this flag is true, "
+            "the analyzer does NOT maintain states of each block in memory for "
+            "analysis. It only feeds the accesses into the cache simulators.");
+DEFINE_string(
+    analyze_correlation_coefficients_labels, "",
+    "Analyze the correlation coefficients of features such as number of past "
+    "accesses with regard to the number of accesses till the next access.");
+DEFINE_int32(analyze_correlation_coefficients_max_number_of_values, 1000000,
+             "The maximum number of values for a feature. If the number of "
+             "values for a feature is larger than this max, it randomly "
+             "selects 'max' number of values.");
+DEFINE_string(human_readable_trace_file_path, "",
+              "The file path that saves human readable access records.");
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+// File name suffix of the miss ratio curve written by WriteMissRatioCurves.
+const std::string kMissRatioCurveFileName = "mrc";
+// Names of the individual group-by labels. A label string joins several of
+// them with '_', e.g. "cf_bt".
+const std::string kGroupbyBlock = "block";
+const std::string kGroupbyTable = "table";
+const std::string kGroupbyColumnFamily = "cf";
+const std::string kGroupbySSTFile = "sst";
+const std::string kGroupbyBlockType = "bt";
+const std::string kGroupbyCaller = "caller";
+const std::string kGroupbyLevel = "level";
+const std::string kGroupbyAll = "all";
+// Label names accepted by ParseLabelStr; any other name is rejected there.
+// NOTE(review): kGroupbyTable is not part of this set, so "table" is rejected
+// by ParseLabelStr even though other code checks for it - confirm whether the
+// omission is intended.
+const std::set<std::string> kGroupbyLabels{
+ kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel,
+ kGroupbyBlockType, kGroupbyCaller, kGroupbyAll};
+// Space-delimited list of cache names (each name is surrounded by spaces).
+// Presumably used to validate the cache_name of a simulator configuration;
+// the use site is not visible here - TODO confirm.
+const std::string kSupportedCacheNames =
+ " lru ghost_lru lru_priority ghost_lru_priority lru_hybrid "
+ "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss "
+ "ghost_lru_hybrid_no_insert_on_row_miss ";
+
+// The suffix for the generated csv files.
+const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline";
+const std::string kFileNameSuffixMissTimeline = "miss_timeline";
+const std::string kFileNameSuffixSkew = "skewness";
+const std::string kFileNameSuffixAccessTimeline = "access_timeline";
+const std::string kFileNameSuffixCorrelation = "correlation_input";
+const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
+ "avg_reuse_interval_naccesses";
+const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval";
+const std::string kFileNameSuffixReuseInterval = "access_reuse_interval";
+const std::string kFileNameSuffixReuseLifetime = "reuse_lifetime";
+const std::string kFileNameSuffixAccessReuseBlocksTimeline =
+ "reuse_blocks_timeline";
+const std::string kFileNameSuffixPercentOfAccessSummary =
+ "percentage_of_accesses_summary";
+const std::string kFileNameSuffixPercentRefKeys = "percent_ref_keys";
+const std::string kFileNameSuffixPercentDataSizeOnRefKeys =
+ "percent_data_size_on_ref_keys";
+const std::string kFileNameSuffixPercentAccessesOnRefKeys =
+ "percent_accesses_on_ref_keys";
+const std::string kFileNameSuffixAccessCountSummary = "access_count_summary";
+
+// Maps a block trace type to its display name. Every trace type other than
+// the five block types below yields "InvalidType".
+std::string block_type_to_string(TraceType type) {
+  const char* name = "InvalidType";
+  switch (type) {
+    case kBlockTraceFilterBlock:
+      name = "Filter";
+      break;
+    case kBlockTraceDataBlock:
+      name = "Data";
+      break;
+    case kBlockTraceIndexBlock:
+      name = "Index";
+      break;
+    case kBlockTraceRangeDeletionBlock:
+      name = "RangeDeletion";
+      break;
+    case kBlockTraceUncompressionDictBlock:
+      name = "UncompressionDict";
+      break;
+    default:
+      // Not expected for block accesses; keep the sentinel name.
+      break;
+  }
+  return name;
+}
+
+// Maps a table reader caller to its display name. Callers not listed below
+// yield "InvalidCaller".
+std::string caller_to_string(TableReaderCaller caller) {
+  const char* name = "InvalidCaller";
+  switch (caller) {
+    case kUserGet:
+      name = "Get";
+      break;
+    case kUserMultiGet:
+      name = "MultiGet";
+      break;
+    case kUserIterator:
+      name = "Iterator";
+      break;
+    case kUserApproximateSize:
+      name = "ApproximateSize";
+      break;
+    case kUserVerifyChecksum:
+      name = "VerifyChecksum";
+      break;
+    case kSSTDumpTool:
+      name = "SSTDumpTool";
+      break;
+    case kExternalSSTIngestion:
+      name = "ExternalSSTIngestion";
+      break;
+    case kRepair:
+      name = "Repair";
+      break;
+    case kPrefetch:
+      name = "Prefetch";
+      break;
+    case kCompaction:
+      name = "Compaction";
+      break;
+    case kCompactionRefill:
+      name = "CompactionRefill";
+      break;
+    case kFlush:
+      name = "Flush";
+      break;
+    case kSSTFileReader:
+      name = "SSTFileReader";
+      break;
+    case kUncategorized:
+      name = "Uncategorized";
+      break;
+    default:
+      // Not expected; keep the sentinel name.
+      break;
+  }
+  return name;
+}
+
+// Inverse of caller_to_string: maps a caller display name to its enum value.
+// Names that match no caller yield kMaxBlockCacheLookupCaller.
+TableReaderCaller string_to_caller(std::string caller_str) {
+  struct NamedCaller {
+    const char* name;
+    TableReaderCaller caller;
+  };
+  static const NamedCaller kNamedCallers[] = {
+      {"Get", kUserGet},
+      {"MultiGet", kUserMultiGet},
+      {"Iterator", kUserIterator},
+      {"ApproximateSize", kUserApproximateSize},
+      {"VerifyChecksum", kUserVerifyChecksum},
+      {"SSTDumpTool", kSSTDumpTool},
+      {"ExternalSSTIngestion", kExternalSSTIngestion},
+      {"Repair", kRepair},
+      {"Prefetch", kPrefetch},
+      {"Compaction", kCompaction},
+      {"CompactionRefill", kCompactionRefill},
+      {"Flush", kFlush},
+      {"SSTFileReader", kSSTFileReader},
+      {"Uncategorized", kUncategorized},
+  };
+  for (const NamedCaller& entry : kNamedCallers) {
+    if (caller_str == entry.name) {
+      return entry.caller;
+    }
+  }
+  return TableReaderCaller::kMaxBlockCacheLookupCaller;
+}
+
+// Returns true for the callers treated as user accesses: Get, MultiGet,
+// Iterator, ApproximateSize and VerifyChecksum.
+bool is_user_access(TableReaderCaller caller) {
+  return caller == kUserGet || caller == kUserMultiGet ||
+         caller == kUserIterator || caller == kUserApproximateSize ||
+         caller == kUserVerifyChecksum;
+}
+
+// Separator line printed between sections of the analyzer's stdout report.
+const char kBreakLine[] =
+    "***************************************************************\n";
+
+// Prints `num_break_lines` separator lines to stdout.
+void print_break_lines(uint32_t num_break_lines) {
+  for (uint32_t i = 0; i < num_break_lines; i++) {
+    // kBreakLine is data, not a format string, so write it with fputs rather
+    // than passing it as the format argument of fprintf.
+    fputs(kBreakLine, stdout);
+  }
+}
+
+// Returns numerator / denomenator expressed in percent, or -1 when the
+// denominator is zero.
+double percent(uint64_t numerator, uint64_t denomenator) {
+  return denomenator == 0 ? -1.0 : numerator * 100.0 / denomenator;
+}
+
+// Re-buckets a timeline: every key is divided by `time_unit` (integer
+// division) and the values that land on the same key are summed. A
+// `time_unit` of 1 returns a copy of the input unchanged.
+std::map<uint64_t, uint64_t> adjust_time_unit(
+    const std::map<uint64_t, uint64_t>& time_stats, uint64_t time_unit) {
+  if (time_unit == 1) {
+    return time_stats;
+  }
+  std::map<uint64_t, uint64_t> adjusted_time_stats;
+  for (auto it = time_stats.begin(); it != time_stats.end(); ++it) {
+    const uint64_t bucket = it->first / time_unit;
+    adjusted_time_stats[bucket] += it->second;
+  }
+  return adjusted_time_stats;
+}
+} // namespace
+
+// Writes one CSV row per simulated cache capacity with its miss ratio and
+// total number of accesses. The file is placed in output_dir_ and named
+// "<trace duration>_<total accesses>_mrc". Does nothing when there is no
+// cache simulator, no output directory, or the file cannot be opened.
+void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const {
+  if (!cache_simulator_ || output_dir_.empty()) {
+    return;
+  }
+  const uint64_t trace_duration =
+      trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+  const uint64_t total_accesses = access_sequence_number_;
+  std::ofstream out(output_dir_ + "/" + std::to_string(trace_duration) + "_" +
+                    std::to_string(total_accesses) + "_" +
+                    kMissRatioCurveFileName);
+  if (!out.is_open()) {
+    return;
+  }
+  // Header row.
+  out << "cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio,total_"
+         "accesses"
+      << std::endl;
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    const auto& caches = config_caches.second;
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      const double miss_ratio = caches[i]->miss_ratio_stats().miss_ratio();
+      // One row per (configuration, capacity) pair.
+      out << config.cache_name << "," << config.num_shard_bits << ","
+          << config.ghost_cache_capacity << "," << config.cache_capacities[i]
+          << "," << std::fixed << std::setprecision(4) << miss_ratio << ","
+          << caches[i]->miss_ratio_stats().total_accesses() << std::endl;
+    }
+  }
+  out.close();
+}
+
+// Appends feature/prediction samples for one access history to the vectors
+// stored under `label`.
+//
+// For access i the "feature" is the gap (in access sequence numbers and in
+// time) to access i-1, which is zero for the first access, together with i
+// as the number of past accesses. The "prediction" paired with it is the gap
+// between access i and access i+1. The last access therefore contributes no
+// feature and the first access contributes no prediction, so a history with
+// fewer than two accesses adds nothing.
+void BlockCacheTraceAnalyzer::UpdateFeatureVectors(
+    const std::vector<uint64_t>& access_sequence_number_timeline,
+    const std::vector<uint64_t>& access_timeline, const std::string& label,
+    std::map<std::string, Features>* label_features,
+    std::map<std::string, Predictions>* label_predictions) const {
+  if (access_sequence_number_timeline.empty() || access_timeline.empty()) {
+    return;
+  }
+  assert(access_timeline.size() == access_sequence_number_timeline.size());
+  const size_t num_accesses = access_sequence_number_timeline.size();
+  if (num_accesses < 2) {
+    // A single access yields neither a feature nor a prediction sample.
+    return;
+  }
+  Features& features = (*label_features)[label];
+  Predictions& predictions = (*label_predictions)[label];
+  for (uint32_t i = 0; i < num_accesses; i++) {
+    // Gap to the previous access; zero for the very first access.
+    uint64_t sequence_gap = 0;
+    uint64_t time_gap = 0;
+    if (i > 0) {
+      sequence_gap = access_sequence_number_timeline[i] -
+                     access_sequence_number_timeline[i - 1];
+      time_gap = access_timeline[i] - access_timeline[i - 1];
+    }
+    if (i < num_accesses - 1) {
+      features.num_accesses_since_last_access.push_back(sequence_gap);
+      features.num_past_accesses.push_back(i);
+      features.elapsed_time_since_last_access.push_back(time_gap);
+    }
+    if (i > 0) {
+      predictions.num_accesses_till_next_access.push_back(sequence_gap);
+      predictions.elapsed_time_till_next_access.push_back(time_gap);
+    }
+  }
+}
+
+// Writes the miss ratio (in percent) per time bucket, one CSV file per cache
+// capacity. Timestamps are bucketed by `time_unit`. The miss ratio observed
+// in the trace itself is stored under the pseudo capacity port::kMaxUint64
+// with the label "trace". Each file has a header row listing every time
+// bucket in range and one row per cache label; buckets without data are
+// written as "0". Does nothing without a cache simulator or output
+// directory, and stops at the first file that cannot be opened.
+void BlockCacheTraceAnalyzer::WriteMissRatioTimeline(uint64_t time_unit) const {
+  if (!cache_simulator_ || output_dir_.empty()) {
+    return;
+  }
+  // capacity -> cache label -> time bucket -> miss ratio in percent.
+  std::map<uint64_t, std::map<std::string, std::map<uint64_t, double>>>
+      ratios_by_capacity;
+  uint64_t first_bucket = port::kMaxUint64;
+  uint64_t last_bucket = 0;
+  // Records the miss ratios of one cache and widens the covered time range.
+  auto record = [&](uint64_t capacity, const std::string& name,
+                    const std::map<uint64_t, uint64_t>& misses,
+                    const std::map<uint64_t, uint64_t>& accesses) {
+    assert(misses.size() == accesses.size());
+    for (auto const& miss_entry : misses) {
+      const uint64_t time = miss_entry.first;
+      first_bucket = std::min(first_bucket, time);
+      last_bucket = std::max(last_bucket, time);
+      auto access_it = accesses.find(time);
+      assert(access_it != accesses.end());
+      ratios_by_capacity[capacity][name][time] =
+          percent(miss_entry.second, access_it->second);
+    }
+  };
+  record(
+      port::kMaxUint64, "trace",
+      adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit),
+      adjust_time_unit(miss_ratio_stats_.num_accesses_timeline(), time_unit));
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    const std::string cache_label =
+        config.cache_name + "-" + std::to_string(config.num_shard_bits) + "-" +
+        std::to_string(config.ghost_cache_capacity);
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      record(
+          config.cache_capacities[i], cache_label,
+          adjust_time_unit(
+              config_caches.second[i]->miss_ratio_stats().num_misses_timeline(),
+              time_unit),
+          adjust_time_unit(config_caches.second[i]
+                               ->miss_ratio_stats()
+                               .num_accesses_timeline(),
+                           time_unit));
+    }
+  }
+  for (auto const& capacity_ratios : ratios_by_capacity) {
+    std::ofstream out(output_dir_ + "/" +
+                      std::to_string(capacity_ratios.first) + "_" +
+                      std::to_string(time_unit) + "_" +
+                      kFileNameSuffixMissRatioTimeline);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("time");
+    for (uint64_t now = first_bucket; now <= last_bucket; now++) {
+      header += "," + std::to_string(now);
+    }
+    out << header << std::endl;
+    for (auto const& label_ratios : capacity_ratios.second) {
+      const std::map<uint64_t, double>& ratios = label_ratios.second;
+      std::string row(label_ratios.first);
+      for (uint64_t now = first_bucket; now <= last_bucket; now++) {
+        auto ratio = ratios.find(now);
+        row += ",";
+        row += ratio != ratios.end() ? std::to_string(ratio->second)
+                                     : std::string("0");
+      }
+      out << row << std::endl;
+    }
+    out.close();
+  }
+}
+
+// Writes the number of misses per time bucket, one CSV file per cache
+// capacity. Timestamps are bucketed by `time_unit`. The misses observed in
+// the trace itself are stored under the pseudo capacity port::kMaxUint64 with
+// the label "trace". Each file has a header row listing every time bucket in
+// range and one row per cache label; buckets without data are written as
+// "0". Does nothing without a cache simulator or output directory, and stops
+// at the first file that cannot be opened.
+void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const {
+  if (!cache_simulator_ || output_dir_.empty()) {
+    return;
+  }
+  // capacity -> cache label -> time bucket -> number of misses.
+  std::map<uint64_t, std::map<std::string, std::map<uint64_t, uint64_t>>>
+      misses_by_capacity;
+  uint64_t first_bucket = port::kMaxUint64;
+  uint64_t last_bucket = 0;
+  // Records the misses of one cache and widens the covered time range.
+  auto record = [&](uint64_t capacity, const std::string& name,
+                    const std::map<uint64_t, uint64_t>& misses) {
+    for (auto const& miss_entry : misses) {
+      const uint64_t time = miss_entry.first;
+      first_bucket = std::min(first_bucket, time);
+      last_bucket = std::max(last_bucket, time);
+      misses_by_capacity[capacity][name][time] = miss_entry.second;
+    }
+  };
+  record(port::kMaxUint64, "trace",
+         adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit));
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    const std::string cache_label =
+        config.cache_name + "-" + std::to_string(config.num_shard_bits) + "-" +
+        std::to_string(config.ghost_cache_capacity);
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      record(
+          config.cache_capacities[i], cache_label,
+          adjust_time_unit(
+              config_caches.second[i]->miss_ratio_stats().num_misses_timeline(),
+              time_unit));
+    }
+  }
+  for (auto const& capacity_misses : misses_by_capacity) {
+    std::ofstream out(output_dir_ + "/" +
+                      std::to_string(capacity_misses.first) + "_" +
+                      std::to_string(time_unit) + "_" +
+                      kFileNameSuffixMissTimeline);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("time");
+    for (uint64_t now = first_bucket; now <= last_bucket; now++) {
+      header += "," + std::to_string(now);
+    }
+    out << header << std::endl;
+    for (auto const& label_misses : capacity_misses.second) {
+      const std::map<uint64_t, uint64_t>& timeline = label_misses.second;
+      std::string row(label_misses.first);
+      for (uint64_t now = first_bucket; now <= last_bucket; now++) {
+        auto num_misses = timeline.find(now);
+        row += ",";
+        row += num_misses != timeline.end()
+                   ? std::to_string(num_misses->second)
+                   : std::string("0");
+      }
+      out << row << std::endl;
+    }
+    out.close();
+  }
+}
+
+// Writes how skewed the accesses are across the groups described by
+// `label_str`.
+//
+// Accesses are summed per group label, the groups are ordered by descending
+// access count, and each percent bucket receives the accesses of the groups
+// between the previous bucket boundary and the first `bucket` percent of all
+// groups. The bucket value port::kMaxUint64 means "all remaining groups".
+//
+// label_str:         group-by labels joined with '_', e.g. "cf_bt".
+// percent_buckets:   bucket boundaries in percent of the number of groups.
+// target_block_type: only blocks of this type are counted;
+//                    TraceType::kTraceMax counts every block type.
+void BlockCacheTraceAnalyzer::WriteSkewness(
+    const std::string& label_str, const std::vector<uint64_t>& percent_buckets,
+    TraceType target_block_type) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, uint64_t> label_naccesses;
+  uint64_t total_naccesses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    if (target_block_type != TraceType::kTraceMax &&
+        target_block_type != type) {
+      return;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    label_naccesses[label] += block.num_accesses;
+    total_naccesses += block.num_accesses;
+  };
+  TraverseBlocks(block_callback, &labels);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_bucket_naccesses;
+  std::vector<std::pair<std::string, uint64_t>> pairs(label_naccesses.begin(),
+                                                      label_naccesses.end());
+  // Sort in descending order of access count.
+  std::sort(pairs.begin(), pairs.end(),
+            [](const std::pair<std::string, uint64_t>& a,
+               const std::pair<std::string, uint64_t>& b) {
+              return b.second < a.second;
+            });
+
+  const uint64_t num_labels = pairs.size();
+  size_t prev_start_index = 0;
+  for (auto const& bucket : percent_buckets) {
+    label_bucket_naccesses[label_str][bucket] = 0;
+    uint64_t end = num_labels;
+    if (bucket != port::kMaxUint64) {
+      // Clamp to the number of groups: a bucket above 100 percent would
+      // otherwise index past the end of `pairs`.
+      end = std::min(num_labels, bucket * num_labels / 100);
+    }
+    const size_t end_index = static_cast<size_t>(end);
+    for (size_t i = prev_start_index; i < end_index; i++) {
+      label_bucket_naccesses[label_str][bucket] += pairs[i].second;
+    }
+    prev_start_index = end_index;
+  }
+  std::string filename_suffix;
+  if (target_block_type != TraceType::kTraceMax) {
+    filename_suffix = block_type_to_string(target_block_type);
+    filename_suffix += "_";
+  }
+  filename_suffix += kFileNameSuffixSkew;
+  WriteStatsToFile(label_str, percent_buckets, filename_suffix,
+                   label_bucket_naccesses, total_naccesses);
+}
+
+// Collects correlation features and predictions for every block, grouped by
+// the labels in `label_str`, and writes them via
+// WriteCorrelationFeaturesToFile. When grouping by caller, each caller's own
+// access history of a block is used; otherwise the block's overall history
+// is used. When grouping by table, blocks with table id 0 are skipped.
+void BlockCacheTraceAnalyzer::WriteCorrelationFeatures(
+    const std::string& label_str, uint32_t max_number_of_values) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  const bool group_by_table = labels.count(kGroupbyTable) > 0;
+  const bool group_by_caller = labels.count(kGroupbyCaller) > 0;
+  std::map<std::string, Features> label_features;
+  std::map<std::string, Predictions> label_predictions;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t fd, uint32_t level,
+          TraceType block_type, const std::string& /*block_key*/,
+          uint64_t /*block_key_id*/, const BlockAccessInfo& block) {
+        if (group_by_table && block.table_id == 0) {
+          // We only know table id information for get requests.
+          return;
+        }
+        if (!group_by_caller) {
+          UpdateFeatureVectors(
+              block.access_sequence_number_timeline, block.access_timeline,
+              BuildLabel(labels, cf_name, fd, level, block_type,
+                         TableReaderCaller::kMaxBlockCacheLookupCaller,
+                         /*block_id=*/0, block),
+              &label_features, &label_predictions);
+          return;
+        }
+        // One feature series per caller that touched this block.
+        for (auto const& caller_timeline : block.caller_access_timeline) {
+          auto const& caller = caller_timeline.first;
+          const std::string label =
+              BuildLabel(labels, cf_name, fd, level, block_type, caller,
+                         /*block_id=*/0, block);
+          auto sequence_numbers =
+              block.caller_access_sequence__number_timeline.find(caller);
+          assert(sequence_numbers !=
+                 block.caller_access_sequence__number_timeline.end());
+          UpdateFeatureVectors(sequence_numbers->second,
+                               caller_timeline.second, label, &label_features,
+                               &label_predictions);
+        }
+      };
+  TraverseBlocks(block_callback, &labels);
+  WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions,
+                                 max_number_of_values);
+}
+
+// Writes one CSV file per entry of `label_features`, named
+// "<label>_<feature label>_correlation_input" inside output_dir_. Each row
+// holds the three feature values and the two matching prediction values of
+// one sample. Samples are written in shuffled order and at most
+// `max_number_of_values` rows are written per file. Stops at the first file
+// that cannot be opened.
+void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesToFile(
+    const std::string& label,
+    const std::map<std::string, Features>& label_features,
+    const std::map<std::string, Predictions>& label_predictions,
+    uint32_t max_number_of_values) const {
+  // Seeded from the current time, so the sample selection differs per run.
+  std::default_random_engine rand_engine(
+      static_cast<std::default_random_engine::result_type>(
+          env_->NowMicros()));
+  for (auto const& entry : label_features) {
+    const std::string& feature_label = entry.first;
+    const Features& past = entry.second;
+    auto prediction_it = label_predictions.find(feature_label);
+    assert(prediction_it != label_predictions.end());
+    const Predictions& future = prediction_it->second;
+    std::ofstream out(output_dir_ + "/" + label + "_" + feature_label + "_" +
+                      kFileNameSuffixCorrelation);
+    if (!out.is_open()) {
+      return;
+    }
+    out << "num_accesses_since_last_access,elapsed_time_since_last_access,num_"
+           "past_accesses,num_accesses_till_next_access,elapsed_time_till_next_"
+           "access"
+        << std::endl;
+    // Shuffle the sample indexes so that a random subset is written when the
+    // number of samples exceeds the limit.
+    std::vector<uint32_t> indexes;
+    for (uint32_t i = 0; i < past.num_accesses_since_last_access.size(); i++) {
+      indexes.push_back(i);
+    }
+    std::shuffle(indexes.begin(), indexes.end(), rand_engine);
+    for (uint32_t n = 0; n < max_number_of_values && n < indexes.size(); n++) {
+      const uint32_t sample = indexes[n];
+      out << std::to_string(past.num_accesses_since_last_access[sample]) << ","
+          << std::to_string(past.elapsed_time_since_last_access[sample]) << ","
+          << std::to_string(past.num_past_accesses[sample]) << ","
+          << std::to_string(future.num_accesses_till_next_access[sample])
+          << ","
+          << std::to_string(future.elapsed_time_till_next_access[sample])
+          << std::endl;
+    }
+    out.close();
+  }
+}
+
+// Builds correlation features from the access history of every entry in
+// get_key_info_map_, all under the single label "GetKeyInfo", and writes
+// them via WriteCorrelationFeaturesToFile.
+void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesForGet(
+    uint32_t max_number_of_values) const {
+  const std::string label("GetKeyInfo");
+  std::map<std::string, Features> label_features;
+  std::map<std::string, Predictions> label_predictions;
+  for (auto it = get_key_info_map_.begin(); it != get_key_info_map_.end();
+       ++it) {
+    const GetKeyInfo& info = it->second;
+    UpdateFeatureVectors(info.access_sequence_number_timeline,
+                         info.access_timeline, label, &label_features,
+                         &label_predictions);
+  }
+  WriteCorrelationFeaturesToFile(label, label_features, label_predictions,
+                                 max_number_of_values);
+}
+
+// Splits a label string of the form "label1_label2_label3" (e.g. "cf_bt")
+// into the set of its label names. If any piece is not in kGroupbyLabels, an
+// error is printed to stderr and an empty set is returned.
+std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr(
+    const std::string& label_str) const {
+  std::set<std::string> labels;
+  std::stringstream ss(label_str);
+  // Loop on good() rather than on the getline result: an empty piece (empty
+  // input or trailing '_') is still read and then rejected as unknown.
+  while (ss.good()) {
+    std::string label_name;
+    std::getline(ss, label_name, '_');
+    if (kGroupbyLabels.count(label_name) == 0) {
+      // Unknown label name.
+      fprintf(stderr, "Unknown label name %s, label string %s\n",
+              label_name.c_str(), label_str.c_str());
+      return {};
+    }
+    labels.insert(label_name);
+  }
+  return labels;
+}
+
+// Builds the group label of a block: the values of the requested labels,
+// taken in the iteration order of `labels` and joined with '-'. A label name
+// without a known value contributes an empty string. Returns an empty
+// string when `labels` is empty.
+std::string BlockCacheTraceAnalyzer::BuildLabel(
+    const std::set<std::string>& labels, const std::string& cf_name,
+    uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller,
+    uint64_t block_key, const BlockAccessInfo& block) const {
+  const std::map<std::string, std::string> label_values = {
+      {kGroupbyAll, kGroupbyAll},
+      {kGroupbyLevel, std::to_string(level)},
+      {kGroupbyCaller, caller_to_string(caller)},
+      {kGroupbySSTFile, std::to_string(fd)},
+      {kGroupbyBlockType, block_type_to_string(type)},
+      {kGroupbyColumnFamily, cf_name},
+      {kGroupbyBlock, std::to_string(block_key)},
+      {kGroupbyTable, std::to_string(block.table_id)},
+  };
+  // Concatenate the label values.
+  std::string label;
+  bool is_first = true;
+  for (auto const& name : labels) {
+    if (!is_first) {
+      label += "-";
+    }
+    is_first = false;
+    auto value = label_values.find(name);
+    if (value != label_values.end()) {
+      label += value->second;
+    }
+  }
+  return label;
+}
+
+// Invokes `block_callback` once for every block recorded in
+// cf_aggregates_map_, walking column family -> SST file -> block type ->
+// block.
+//
+// block_callback: receives the column family name, file number, level, block
+//                 type, block key, block id and the block's access info.
+// labels:         optional (may be null). When it contains kGroupbyTable,
+//                 blocks whose table id is 0 are skipped, because the table
+//                 id is only known for get requests.
+void BlockCacheTraceAnalyzer::TraverseBlocks(
+    std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
+                       uint32_t /*level*/, TraceType /*block_type*/,
+                       const std::string& /*block_key*/,
+                       uint64_t /*block_key_id*/,
+                       const BlockAccessInfo& /*block_access_info*/)>
+        block_callback,
+    std::set<std::string>* labels) const {
+  const bool skip_blocks_without_table_id =
+      labels && labels->find(kGroupbyTable) != labels->end();
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          if (skip_blocks_without_table_id &&
+              block_access_info.second.table_id == 0) {
+            // We only know table id information for get requests. Skip just
+            // this block; returning here would silently drop every block
+            // that follows it in the traversal.
+            continue;
+          }
+          block_callback(cf_name, fd, level, type, block_access_info.first,
+                         block_access_info.second.block_id,
+                         block_access_info.second);
+        }
+      }
+    }
+  }
+}
+
+// Writes three bucketed distributions over blocks with a known key count,
+// grouped by the labels in `label_str`:
+//   - percentage of the block's keys that were referenced,
+//   - percentage of Get accesses whose referenced key exists in the block,
+//   - percentage of the block's size covered by referenced data.
+// Each block is counted in the first bucket whose boundary is strictly
+// greater than its percentage.
+//
+// label_str:       group-by labels joined with '_', e.g. "cf_bt".
+// percent_buckets: bucket boundaries in percent.
+void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
+    const std::string& label_str,
+    const std::vector<uint64_t>& percent_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefkeys_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefs_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pndatasize_nblocks;
+  uint64_t nblocks = 0;
+  // Counts one block in the first bucket whose boundary exceeds `value`. A
+  // value that no bucket covers (or an empty bucket list) is left uncounted
+  // instead of dereferencing the end iterator.
+  auto add_to_bucket = [](std::map<uint64_t, uint64_t>& bucket_nblocks,
+                          uint64_t value) {
+    auto bucket = bucket_nblocks.upper_bound(value);
+    if (bucket != bucket_nblocks.end()) {
+      bucket->second += 1;
+    }
+  };
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType /*block_type*/,
+                            const std::string& /*block_key*/,
+                            uint64_t /*block_key_id*/,
+                            const BlockAccessInfo& block) {
+    if (block.num_keys == 0) {
+      return;
+    }
+    // Number of accesses to this block's keys issued by Get.
+    uint64_t naccesses = 0;
+    for (auto const& key_access : block.key_num_access_map) {
+      for (auto const& caller_access : key_access.second) {
+        if (caller_access.first == TableReaderCaller::kUserGet) {
+          naccesses += caller_access.second;
+        }
+      }
+    }
+    const std::string label =
+        BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock,
+                   TableReaderCaller::kUserGet, /*block_id=*/0, block);
+
+    // percent() returns -1 for a zero denominator; clamp that to 0.
+    const uint64_t percent_referenced_for_existing_keys =
+        static_cast<uint64_t>(std::max(
+            percent(block.key_num_access_map.size(), block.num_keys), 0.0));
+    const uint64_t percent_accesses_for_existing_keys =
+        static_cast<uint64_t>(std::max(
+            percent(block.num_referenced_key_exist_in_block, naccesses), 0.0));
+    const uint64_t percent_referenced_data_size = static_cast<uint64_t>(
+        std::max(percent(block.referenced_data_size, block.block_size), 0.0));
+    if (label_pnrefkeys_nblocks.find(label) == label_pnrefkeys_nblocks.end()) {
+      // First block of this label: create every bucket with a zero count.
+      for (auto const& percent_bucket : percent_buckets) {
+        label_pnrefkeys_nblocks[label][percent_bucket] = 0;
+        label_pnrefs_nblocks[label][percent_bucket] = 0;
+        label_pndatasize_nblocks[label][percent_bucket] = 0;
+      }
+    }
+    add_to_bucket(label_pnrefkeys_nblocks[label],
+                  percent_referenced_for_existing_keys);
+    add_to_bucket(label_pnrefs_nblocks[label],
+                  percent_accesses_for_existing_keys);
+    add_to_bucket(label_pndatasize_nblocks[label],
+                  percent_referenced_data_size);
+    nblocks += 1;
+  };
+  TraverseBlocks(block_callback, &labels);
+  WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys,
+                   label_pnrefkeys_nblocks, nblocks);
+  WriteStatsToFile(label_str, percent_buckets,
+                   kFileNameSuffixPercentAccessesOnRefKeys,
+                   label_pnrefs_nblocks, nblocks);
+  WriteStatsToFile(label_str, percent_buckets,
+                   kFileNameSuffixPercentDataSizeOnRefKeys,
+                   label_pndatasize_nblocks, nblocks);
+}
+
+void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
+                                                  uint64_t time_unit,
+                                                  bool user_access_only) const {
+  // Writes the number of accesses per time slot (timestamp / time_unit) to
+  // "<output_dir_>/<user_access_only_|all_access_><label_str>_<time_unit>_
+  // <kFileNameSuffixAccessTimeline>". The first row is "time" followed by
+  // every slot between the first and last observed one; each following row is
+  // a label followed by its access count per slot ("0" when there is none).
+  // When the labels contain "block", rows are block ids ordered from the most
+  // to the least accessed block. Nothing is written if time_unit is 0 or the
+  // output file cannot be opened.
+  if (time_unit == 0) {
+    // Timestamps are divided by time_unit below; avoid dividing by zero.
+    return;
+  }
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  uint64_t start_time = port::kMaxUint64;
+  uint64_t end_time = 0;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline;
+  // Total (filtered) access count -> ids of the blocks with that count.
+  std::map<uint64_t, std::vector<std::string>> access_count_block_id_map;
+
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    uint64_t naccesses = 0;
+    for (auto const& timeline : block.caller_num_accesses_timeline) {
+      const TableReaderCaller caller = timeline.first;
+      if (user_access_only && !is_user_access(caller)) {
+        continue;
+      }
+      const std::string label =
+          BuildLabel(labels, cf_name, fd, level, type, caller, block_id, block);
+      for (auto const& naccess : timeline.second) {
+        const uint64_t timestamp = naccess.first / time_unit;
+        const uint64_t num = naccess.second;
+        label_access_timeline[label][timestamp] += num;
+        start_time = std::min(start_time, timestamp);
+        end_time = std::max(end_time, timestamp);
+        naccesses += num;
+      }
+    }
+    if (naccesses > 0) {
+      access_count_block_id_map[naccesses].push_back(std::to_string(block_id));
+    }
+  };
+  TraverseBlocks(block_callback, &labels);
+
+  // We have label_access_timeline now. Write them into a file.
+  const std::string user_access_prefix =
+      user_access_only ? "user_access_only_" : "all_access_";
+  const std::string output_path = output_dir_ + "/" + user_access_prefix +
+                                  label_str + "_" + std::to_string(time_unit) +
+                                  "_" + kFileNameSuffixAccessTimeline;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+
+  // The header row is the same for both output layouts.
+  std::string header("time");
+  for (uint64_t now = start_time; now <= end_time; now++) {
+    header += ",";
+    header += std::to_string(now);
+  }
+  out << header << std::endl;
+
+  // Emits one row: the row name followed by the count of every time slot.
+  auto write_row = [&](const std::string& row_name,
+                       const std::map<uint64_t, uint64_t>& counts) {
+    std::string row(row_name);
+    for (uint64_t now = start_time; now <= end_time; now++) {
+      auto it = counts.find(now);
+      row += ",";
+      if (it != counts.end()) {
+        row += std::to_string(it->second);
+      } else {
+        row += "0";
+      }
+    }
+    out << row << std::endl;
+  };
+
+  if (labels.find("block") != labels.end()) {
+    // Used for block ids that have no timeline entry; yields an all-zero row
+    // without inserting into label_access_timeline.
+    const std::map<uint64_t, uint64_t> no_accesses;
+    // Write the most frequently accessed blocks first.
+    for (auto naccess_it = access_count_block_id_map.rbegin();
+         naccess_it != access_count_block_id_map.rend(); naccess_it++) {
+      for (auto const& block_id_str : naccess_it->second) {
+        auto timeline_it = label_access_timeline.find(block_id_str);
+        if (timeline_it != label_access_timeline.end()) {
+          write_row(block_id_str, timeline_it->second);
+        } else {
+          write_row(block_id_str, no_accesses);
+        }
+      }
+    }
+    out.close();
+    return;
+  }
+
+  for (auto const& label : label_access_timeline) {
+    write_row(label.first, label.second);
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseDistance(
+    const std::string& label_str,
+    const std::vector<uint64_t>& distance_buckets) const {
+  // Aggregates each block's reuse_distance_count into per-label distance
+  // buckets and writes, for every bucket and label, the percentage of all
+  // recorded reuses that fall into it. The output file is
+  // "<output_dir_>/<label_str>_reuse_distance".
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses;
+  uint64_t total_num_reuses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    if (label_distance_num_reuses.find(label) ==
+        label_distance_num_reuses.end()) {
+      // The first time we encounter this label.
+      for (auto const& distance_bucket : distance_buckets) {
+        label_distance_num_reuses[label][distance_bucket] = 0;
+      }
+    }
+    for (auto const& reuse_distance : block.reuse_distance_count) {
+      // upper_bound selects the first bucket strictly greater than the
+      // observed distance.
+      label_distance_num_reuses[label]
+          .upper_bound(reuse_distance.first)
+          ->second += reuse_distance.second;
+      total_num_reuses += reuse_distance.second;
+    }
+  };
+  TraverseBlocks(block_callback, &labels);
+  // The table layout ("bucket" header, one column per label, one row per
+  // bucket with percent(count, total)) is exactly what WriteStatsToFile
+  // emits, so reuse it instead of duplicating the file-writing code.
+  WriteStatsToFile(label_str, distance_buckets, "reuse_distance",
+                   label_distance_num_reuses, total_num_reuses);
+}
+
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::vector<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  // Folds one access timeline (timestamp -> number of accesses at that
+  // timestamp) into the reuse-interval histogram of `label`.
+  //   - Every access beyond the first at the same timestamp is counted as a
+  //     reuse with interval 0.
+  //   - The first access at each later timestamp is counted as a reuse whose
+  //     interval is the distance to the previous timestamp.
+  // *total_num_reuses is increased by the number of reuses recorded. Both
+  // output pointers must be non-null. An empty timeline records nothing.
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  if (timeline.empty()) {
+    // Nothing to fold in; dereferencing begin() below would be invalid.
+    return;
+  }
+  std::map<uint64_t, uint64_t>& bucket_counts = (*label_time_num_reuses)[label];
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  const uint64_t prev_num = it->second;
+  it++;
+  // Reused within one second.
+  if (prev_num > 1) {
+    bucket_counts.upper_bound(0)->second += prev_num - 1;
+    *total_num_reuses += prev_num - 1;
+  }
+  while (it != timeline.end()) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    const uint64_t reuse_interval = timestamp - prev_timestamp;
+    bucket_counts.upper_bound(reuse_interval)->second += 1;
+    if (num > 1) {
+      bucket_counts.upper_bound(0)->second += num - 1;
+    }
+    prev_timestamp = timestamp;
+    // One reuse for the interval plus (num - 1) same-timestamp reuses.
+    *total_num_reuses += num;
+    it++;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteStatsToFile(
+    const std::string& label_str, const std::vector<uint64_t>& time_buckets,
+    const std::string& filename_suffix,
+    const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data,
+    uint64_t ntotal) const {
+  // Writes a comma-separated table to
+  // "<output_dir_>/<label_str>_<filename_suffix>": a header row "bucket"
+  // followed by one column per label, then one row per bucket whose cells are
+  // percent(count, ntotal). Every label is expected to hold an entry for
+  // every bucket (asserted). Returns without output if the file cannot be
+  // opened.
+  std::ofstream out(output_dir_ + "/" + label_str + "_" + filename_suffix);
+  if (!out.is_open()) {
+    return;
+  }
+
+  std::string header_line("bucket");
+  for (const auto& entry : label_data) {
+    header_line.append(",").append(entry.first);
+  }
+  out << header_line << std::endl;
+
+  for (const uint64_t bucket : time_buckets) {
+    std::string line = std::to_string(bucket);
+    for (const auto& entry : label_data) {
+      const auto cell = entry.second.find(bucket);
+      assert(cell != entry.second.end());
+      line.append(",").append(std::to_string(percent(cell->second, ntotal)));
+    }
+    out << line << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::vector<uint64_t>& time_buckets) const {
+  // Builds up to three bucketed tables and writes each via WriteStatsToFile:
+  //   1. reuse intervals, accumulated by UpdateReuseIntervalStats;
+  //   2. number of blocks per average-reuse-interval bucket;
+  //   3. number of accesses per average-reuse-interval bucket.
+  // Tables 2 and 3 are only filled when the labels do not group by caller.
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  using Histogram = std::map<uint64_t, uint64_t>;
+  std::map<std::string, Histogram> reuse_interval_hist;
+  std::map<std::string, Histogram> avg_interval_nblocks_hist;
+  std::map<std::string, Histogram> avg_interval_naccesses_hist;
+
+  uint64_t num_reuses = 0;
+  uint64_t num_blocks = 0;
+  uint64_t num_accesses = 0;
+  auto on_block = [&](const std::string& cf_name, uint64_t fd, uint32_t level,
+                      TraceType type, const std::string& /*block_key*/,
+                      uint64_t block_id, const BlockAccessInfo& block) {
+    num_blocks++;
+    num_accesses += block.num_accesses;
+
+    if (labels.find(kGroupbyCaller) != labels.end()) {
+      // One timeline per caller, each reported under its own label.
+      for (const auto& caller_entry : block.caller_num_accesses_timeline) {
+        const std::string caller_label =
+            BuildLabel(labels, cf_name, fd, level, type, caller_entry.first,
+                       block_id, block);
+        UpdateReuseIntervalStats(caller_label, time_buckets,
+                                 caller_entry.second, &reuse_interval_hist,
+                                 &num_reuses);
+      }
+      return;
+    }
+
+    // Not grouped by caller: merge all callers into a single timeline.
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    std::map<uint64_t, uint64_t> merged_timeline;
+    for (const auto& caller_entry : block.caller_num_accesses_timeline) {
+      for (const auto& slot : caller_entry.second) {
+        merged_timeline[slot.first] += slot.second;
+      }
+    }
+    UpdateReuseIntervalStats(label, time_buckets, merged_timeline,
+                             &reuse_interval_hist, &num_reuses);
+
+    // Blocks accessed at most once get a value just below the maximum so
+    // that they land in the last bucket.
+    uint64_t avg_reuse_interval = port::kMaxUint64 - 1;
+    if (block.num_accesses > 1) {
+      avg_reuse_interval = ((block.last_access_time - block.first_access_time) /
+                            kMicrosInSecond) /
+                           block.num_accesses;
+    }
+
+    if (avg_interval_nblocks_hist.count(label) == 0) {
+      for (const uint64_t bucket : time_buckets) {
+        avg_interval_nblocks_hist[label][bucket] = 0;
+        avg_interval_naccesses_hist[label][bucket] = 0;
+      }
+    }
+    avg_interval_nblocks_hist[label].upper_bound(avg_reuse_interval)->second++;
+    avg_interval_naccesses_hist[label]
+        .upper_bound(avg_reuse_interval)
+        ->second += block.num_accesses;
+  };
+  TraverseBlocks(on_block, &labels);
+
+  // Write the stats into files.
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval,
+                   reuse_interval_hist, num_reuses);
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixAvgReuseInterval,
+                   avg_interval_nblocks_hist, num_blocks);
+  WriteStatsToFile(label_str, time_buckets,
+                   kFileNameSuffixAvgReuseIntervalNaccesses,
+                   avg_interval_naccesses_hist, num_accesses);
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseLifetime(
+    const std::string& label_str,
+    const std::vector<uint64_t>& time_buckets) const {
+  // Counts, per label and lifetime bucket, how many blocks fall into the
+  // bucket and writes the table via WriteStatsToFile. A block's lifetime is
+  // (last_access_time - first_access_time) / kMicrosInSecond; blocks accessed
+  // at most once use port::kMaxUint64 - 1 instead.
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> lifetime_hist;
+  uint64_t num_blocks = 0;
+  auto on_block = [&](const std::string& cf_name, uint64_t fd, uint32_t level,
+                      TraceType type, const std::string& /*block_key*/,
+                      uint64_t block_id, const BlockAccessInfo& block) {
+    uint64_t lifetime = port::kMaxUint64 - 1;
+    if (block.num_accesses > 1) {
+      lifetime =
+          (block.last_access_time - block.first_access_time) / kMicrosInSecond;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+
+    if (lifetime_hist.count(label) == 0) {
+      // New label: create all buckets with a zero count.
+      for (const uint64_t bucket : time_buckets) {
+        lifetime_hist[label][bucket] = 0;
+      }
+    }
+    lifetime_hist[label].upper_bound(lifetime)->second++;
+    num_blocks++;
+  };
+  TraverseBlocks(on_block, &labels);
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime,
+                   lifetime_hist, num_blocks);
+}
+
+void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline(
+    const uint64_t reuse_window, bool user_access_only,
+    TraceType block_type) const {
+  // Splits the trace duration into windows of `reuse_window` seconds and, for
+  // every start window s and later window j, writes the percentage of the
+  // blocks accessed in s that are also accessed in j. Cells with j < s are
+  // written as "100.0". The output file is
+  // "<output_dir_>/<block type><_user_access_only_|_all_access_>
+  // <reuse_window>_<kFileNameSuffixAccessReuseBlocksTimeline>".
+  // Nothing is written when reuse_window is 0, when the trace spans fewer
+  // than two windows, or when the file cannot be opened.
+  //
+  // NOTE(review): block_type only contributes to the file name; the callback
+  // below does not filter blocks by their type. Confirm whether that is
+  // intended.
+  if (reuse_window == 0) {
+    // The window is used as a divisor below.
+    return;
+  }
+  // A map from block id to an array of bools that states whether a block is
+  // accessed in a time window.
+  std::map<uint64_t, std::vector<bool>> block_accessed;
+  const uint64_t trace_duration =
+      trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+  const uint64_t reuse_vector_size = (trace_duration / reuse_window);
+  if (reuse_vector_size < 2) {
+    // Fewer than 2 windows. We cannot calculate the reused percentage of
+    // blocks.
+    return;
+  }
+  auto block_callback = [&](const std::string& /*cf_name*/, uint64_t /*fd*/,
+                            uint32_t /*level*/, TraceType /*type*/,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    std::vector<bool>& windows = block_accessed[block_id];
+    if (windows.empty()) {
+      // Newly inserted entry; resize() value-initializes every slot to false.
+      windows.resize(reuse_vector_size);
+    }
+    for (auto const& caller_num : block.caller_num_accesses_timeline) {
+      const TableReaderCaller caller = caller_num.first;
+      if (user_access_only && !is_user_access(caller)) {
+        continue;
+      }
+      for (auto const& timeline : caller_num.second) {
+        const uint64_t elapsed_time =
+            timeline.first - trace_start_timestamp_in_seconds_;
+        // Accesses past the last full window are attributed to the last one.
+        const uint64_t index =
+            std::min(elapsed_time / reuse_window, reuse_vector_size - 1);
+        windows[index] = true;
+      }
+    }
+  };
+  TraverseBlocks(block_callback);
+
+  // A cell is the number of blocks accessed in a reuse window. The table is a
+  // zero-initialized square matrix stored row by row.
+  std::vector<uint64_t> reuse_table(reuse_vector_size * reuse_vector_size, 0);
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    // Examine all blocks.
+    for (auto const& block : block_accessed) {
+      for (uint64_t i = start_time; i < reuse_vector_size; i++) {
+        if (block.second[start_time] && block.second[i]) {
+          // This block is accessed at start time and at the current time. We
+          // increment reuse_table[start_time][i] since it is reused at the ith
+          // window.
+          reuse_table[start_time * reuse_vector_size + i]++;
+        }
+      }
+    }
+  }
+  const std::string user_access_prefix =
+      user_access_only ? "_user_access_only_" : "_all_access_";
+  const std::string output_path =
+      output_dir_ + "/" + block_type_to_string(block_type) +
+      user_access_prefix + std::to_string(reuse_window) + "_" +
+      kFileNameSuffixAccessReuseBlocksTimeline;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("start_time");
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    header += ",";
+    header += std::to_string(start_time);
+  }
+  out << header << std::endl;
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    std::string row(std::to_string(start_time * reuse_window));
+    // The diagonal cell holds the number of blocks accessed in this window.
+    const uint64_t accessed_at_start =
+        reuse_table[start_time * reuse_vector_size + start_time];
+    for (uint64_t j = 0; j < reuse_vector_size; j++) {
+      row += ",";
+      if (j < start_time) {
+        row += "100.0";
+      } else {
+        row += std::to_string(percent(
+            reuse_table[start_time * reuse_vector_size + j],
+            accessed_at_start));
+      }
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+std::string BlockCacheTraceAnalyzer::OutputPercentAccessStats(
+    uint64_t total_accesses,
+    const std::map<std::string, uint64_t>& cf_access_count) const {
+  // Returns one ",<value>" cell per column family in cf_aggregates_map_, in
+  // that map's iteration order. The value is percent(count, total_accesses)
+  // when cf_access_count has an entry for the column family and "0"
+  // otherwise.
+  std::string cells;
+  for (const auto& cf_entry : cf_aggregates_map_) {
+    cells += ",";
+    const auto count_it = cf_access_count.find(cf_entry.first);
+    if (count_it == cf_access_count.end()) {
+      cells += "0";
+      continue;
+    }
+    cells += std::to_string(percent(count_it->second, total_accesses));
+  }
+  return cells;
+}
+
+void BlockCacheTraceAnalyzer::WritePercentAccessSummaryStats() const {
+  // Writes "<output_dir_>/<kFileNameSuffixPercentOfAccessSummary>": a header
+  // row "caller" followed by the column family names, then one row per caller
+  // with the percentage of all accesses that the caller made to each column
+  // family. Returns without output if the file cannot be opened.
+  std::map<TableReaderCaller, std::map<std::string, uint64_t>>
+      accesses_by_caller;
+  uint64_t num_accesses = 0;
+  auto on_block = [&](const std::string& cf_name, uint64_t /*fd*/,
+                      uint32_t /*level*/, TraceType /*type*/,
+                      const std::string& /*block_key*/, uint64_t /*block_id*/,
+                      const BlockAccessInfo& block) {
+    for (const auto& entry : block.caller_num_access_map) {
+      accesses_by_caller[entry.first][cf_name] += entry.second;
+      num_accesses += entry.second;
+    }
+  };
+  TraverseBlocks(on_block);
+
+  std::ofstream out(output_dir_ + "/" + kFileNameSuffixPercentOfAccessSummary);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header_line("caller");
+  for (const auto& cf_entry : cf_aggregates_map_) {
+    header_line.append(",").append(cf_entry.first);
+  }
+  out << header_line << std::endl;
+  for (const auto& caller_entry : accesses_by_caller) {
+    std::string line(caller_to_string(caller_entry.first));
+    line += OutputPercentAccessStats(num_accesses, caller_entry.second);
+    out << line << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteDetailedPercentAccessSummaryStats(
+    TableReaderCaller analyzing_caller) const {
+  // Breaks down the accesses made by `analyzing_caller` per column family,
+  // once by level and once by block type, and writes the two tables to
+  // "<output_dir_>/<caller>_level_<suffix>" and
+  // "<output_dir_>/<caller>_bt_<suffix>". If the first file cannot be opened,
+  // the second one is not attempted.
+  std::map<uint32_t, std::map<std::string, uint64_t>> accesses_by_level;
+  std::map<TraceType, std::map<std::string, uint64_t>> accesses_by_type;
+  uint64_t num_accesses = 0;
+  auto on_block = [&](const std::string& cf_name, uint64_t /*fd*/,
+                      uint32_t level, TraceType type,
+                      const std::string& /*block_key*/, uint64_t /*block_id*/,
+                      const BlockAccessInfo& block) {
+    for (const auto& entry : block.caller_num_access_map) {
+      if (entry.first != analyzing_caller) {
+        continue;
+      }
+      accesses_by_level[level][cf_name] += entry.second;
+      accesses_by_type[type][cf_name] += entry.second;
+      num_accesses += entry.second;
+    }
+  };
+  TraverseBlocks(on_block);
+
+  // Both files share the column family part of the header row.
+  std::string cf_columns;
+  for (const auto& cf_entry : cf_aggregates_map_) {
+    cf_columns += ",";
+    cf_columns += cf_entry.first;
+  }
+
+  {
+    // Per-level breakdown.
+    std::ofstream out(output_dir_ + "/" + caller_to_string(analyzing_caller) +
+                      "_level_" + kFileNameSuffixPercentOfAccessSummary);
+    if (!out.is_open()) {
+      return;
+    }
+    out << std::string("level") + cf_columns << std::endl;
+    for (const auto& level_entry : accesses_by_level) {
+      std::string line = std::to_string(level_entry.first);
+      line += OutputPercentAccessStats(num_accesses, level_entry.second);
+      out << line << std::endl;
+    }
+    out.close();
+  }
+  {
+    // Per-block-type breakdown.
+    std::ofstream out(output_dir_ + "/" + caller_to_string(analyzing_caller) +
+                      "_bt_" + kFileNameSuffixPercentOfAccessSummary);
+    if (!out.is_open()) {
+      return;
+    }
+    out << std::string("bt") + cf_columns << std::endl;
+    for (const auto& type_entry : accesses_by_type) {
+      std::string line(block_type_to_string(type_entry.first));
+      line += OutputPercentAccessStats(num_accesses, type_entry.second);
+      out << line << std::endl;
+    }
+    out.close();
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats(
+    const std::vector<uint64_t>& access_count_buckets,
+    bool user_access_only) const {
+  // Counts how many blocks fall into each access-count bucket, once grouped
+  // by column family and once by block type, and writes both tables through
+  // WriteStatsToFile. Blocks without any counted access still create the
+  // (zeroed) columns for their column family and block type but are not
+  // counted themselves.
+  // x: buckets.
+  // y: # of blocks.
+  std::map<std::string, std::map<uint64_t, uint64_t>> nblocks_by_type;
+  std::map<std::string, std::map<uint64_t, uint64_t>> nblocks_by_cf;
+  uint64_t num_blocks = 0;
+  auto on_block = [&](const std::string& cf_name, uint64_t /*fd*/,
+                      uint32_t /*level*/, TraceType type,
+                      const std::string& /*block_key*/, uint64_t /*block_id*/,
+                      const BlockAccessInfo& block) {
+    const std::string type_name = block_type_to_string(type);
+    if (nblocks_by_cf.count(cf_name) == 0) {
+      for (const uint64_t bucket : access_count_buckets) {
+        nblocks_by_cf[cf_name][bucket] = 0;
+      }
+    }
+    if (nblocks_by_type.count(type_name) == 0) {
+      for (const uint64_t bucket : access_count_buckets) {
+        nblocks_by_type[type_name][bucket] = 0;
+      }
+    }
+
+    uint64_t block_accesses = 0;
+    for (const auto& entry : block.caller_num_access_map) {
+      if (user_access_only && !is_user_access(entry.first)) {
+        continue;
+      }
+      block_accesses += entry.second;
+    }
+    if (block_accesses == 0) {
+      return;
+    }
+    num_blocks++;
+    nblocks_by_type[type_name].upper_bound(block_accesses)->second++;
+    nblocks_by_cf[cf_name].upper_bound(block_accesses)->second++;
+  };
+  TraverseBlocks(on_block);
+
+  const std::string user_access_prefix =
+      user_access_only ? "user_access_only_" : "all_access_";
+  const std::string file_suffix =
+      user_access_prefix + kFileNameSuffixAccessCountSummary;
+  WriteStatsToFile("cf", access_count_buckets, file_suffix, nblocks_by_cf,
+                   num_blocks);
+  WriteStatsToFile("bt", access_count_buckets, file_suffix, nblocks_by_type,
+                   num_blocks);
+}
+
+BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(  // Only stores its arguments; the constructor body is empty.
+ const std::string& trace_file_path, const std::string& output_dir,
+ const std::string& human_readable_trace_file_path,
+ bool compute_reuse_distance, bool mrc_only,
+ bool is_human_readable_trace_file,
+ std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)  // ownership is moved into cache_simulator_
+ : env_(ROCKSDB_NAMESPACE::Env::Default()),  // always the default Env; not configurable here
+ trace_file_path_(trace_file_path),  // opened later by Analyze()
+ output_dir_(output_dir),  // prefix of every file written by the Write* methods
+ human_readable_trace_file_path_(human_readable_trace_file_path),  // Analyze() opens a writer only if non-empty
+ compute_reuse_distance_(compute_reuse_distance),  // enables reuse-distance bookkeeping in RecordAccess()
+ mrc_only_(mrc_only),  // when true, Analyze() skips RecordAccess()
+ is_human_readable_trace_file_(is_human_readable_trace_file),  // selects the reader type in Analyze()
+ cache_simulator_(std::move(cache_simulator)) {}  // may be null; Analyze() checks before use
+
+void BlockCacheTraceAnalyzer::ComputeReuseDistance(
+    BlockAccessInfo* info) const {
+  // Records one reuse-distance sample for `info`: the sum of the block sizes
+  // of all blocks listed in info->unique_blocks_since_last_access. Blocks
+  // with no recorded access yet are skipped. The set is emptied afterwards so
+  // that the next sample starts from scratch.
+  assert(info);
+  if (info->num_accesses == 0) {
+    return;
+  }
+  uint64_t distance = 0;
+  for (const auto& key : info->unique_blocks_since_last_access) {
+    const auto found = block_info_map_.find(key);
+    // Every key in the set must already be known to the analyzer.
+    assert(found != block_info_map_.end());
+    distance += found->second->block_size;
+  }
+  info->reuse_distance_count[distance]++;
+  info->unique_blocks_since_last_access.clear();
+}
+
+Status BlockCacheTraceAnalyzer::RecordAccess(
+    const BlockCacheTraceRecord& access) {
+  // Folds one trace record into the per-column-family / per-file /
+  // per-block-type aggregates, assigns ids to blocks and Get keys seen for
+  // the first time, optionally maintains reuse-distance bookkeeping, and
+  // forwards the record to the human readable trace writer. Returns the
+  // writer's status.
+  SSTFileAccessInfoAggregate& file_aggr =
+      cf_aggregates_map_[access.cf_name]
+          .fd_aggregates_map[access.sst_fd_number];
+  file_aggr.level = access.level;
+  auto& block_map =
+      file_aggr.block_type_aggregates_map[access.block_type]
+          .block_access_info_map;
+
+  const bool is_new_block = block_map.find(access.block_key) == block_map.end();
+  BlockAccessInfo& block_access_info = block_map[access.block_key];
+  if (is_new_block) {
+    block_access_info.block_id = unique_block_id_++;
+  }
+  if (compute_reuse_distance_) {
+    // Must run before AddAccess so that it sees the state prior to this
+    // access.
+    ComputeReuseDistance(&block_access_info);
+  }
+  block_access_info.AddAccess(access, access_sequence_number_);
+  block_info_map_[access.block_key] = &block_access_info;
+
+  uint64_t get_key_id = 0;
+  const bool is_tracked_get =
+      access.caller == TableReaderCaller::kUserGet &&
+      access.get_id != BlockCacheTraceHelper::kReservedGetId;
+  if (is_tracked_get) {
+    const std::string user_key =
+        ExtractUserKey(access.referenced_key).ToString();
+    const bool is_new_key =
+        get_key_info_map_.find(user_key) == get_key_info_map_.end();
+    auto& key_info = get_key_info_map_[user_key];
+    if (is_new_key) {
+      key_info.key_id = unique_get_key_id_++;
+    }
+    get_key_id = key_info.key_id;
+    key_info.AddAccess(access, access_sequence_number_);
+  }
+
+  if (compute_reuse_distance_) {
+    // Add this block to the "seen since last access" set of every known
+    // block. This visits all blocks recorded so far on every access.
+    for (auto& cf_entry : cf_aggregates_map_) {
+      for (auto& file_entry : cf_entry.second.fd_aggregates_map) {
+        for (auto& type_entry : file_entry.second.block_type_aggregates_map) {
+          for (auto& block_entry : type_entry.second.block_access_info_map) {
+            block_entry.second.unique_blocks_since_last_access.insert(
+                access.block_key);
+          }
+        }
+      }
+    }
+  }
+  return human_readable_trace_writer_.WriteHumanReadableTraceRecord(
+      access, block_access_info.block_id, get_key_id);
+}
+
+Status BlockCacheTraceAnalyzer::Analyze() {
+  // Reads the trace file record by record, feeding each record to
+  // RecordAccess (unless mrc_only_), the miss ratio stats and the optional
+  // cache simulator, and prints a progress line to stdout roughly every ten
+  // seconds of wall time plus once at the end. Returns the first non-OK
+  // status encountered; the read loop only ends on a non-OK status.
+  std::unique_ptr<BlockCacheTraceReader> reader;
+  Status s = Status::OK();
+  if (is_human_readable_trace_file_) {
+    reader.reset(new BlockCacheHumanReadableTraceReader(trace_file_path_));
+  } else {
+    std::unique_ptr<TraceReader> trace_reader;
+    s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader);
+    if (!s.ok()) {
+      return s;
+    }
+    reader.reset(new BlockCacheTraceReader(std::move(trace_reader)));
+    s = reader->ReadHeader(&header_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  if (!human_readable_trace_file_path_.empty()) {
+    s = human_readable_trace_writer_.NewWritableFile(
+        human_readable_trace_file_path_, env_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  const uint64_t start = env_->NowMicros();
+  // Wall-clock seconds spent in this function so far.
+  auto elapsed_seconds = [&]() {
+    return (env_->NowMicros() - start) / kMicrosInSecond;
+  };
+  // Prints one progress line for the given elapsed wall time.
+  auto report_progress = [&](uint64_t duration) {
+    const uint64_t trace_duration =
+        trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+    fprintf(stdout,
+            "Running for %" PRIu64 " seconds: Processed %" PRIu64
+            " records/second. Trace duration %" PRIu64
+            " seconds. Observed miss ratio %.2f\n",
+            duration, duration > 0 ? access_sequence_number_ / duration : 0,
+            trace_duration, miss_ratio_stats_.miss_ratio());
+  };
+
+  // Number of progress lines printed so far inside the loop.
+  uint64_t time_interval = 0;
+  while (s.ok()) {
+    BlockCacheTraceRecord access;
+    s = reader->ReadAccess(&access);
+    if (!s.ok()) {
+      break;
+    }
+    if (!mrc_only_) {
+      s = RecordAccess(access);
+      if (!s.ok()) {
+        break;
+      }
+    }
+    const uint64_t access_seconds = access.access_timestamp / kMicrosInSecond;
+    if (trace_start_timestamp_in_seconds_ == 0) {
+      // Zero doubles as the "not set yet" marker for the start timestamp.
+      trace_start_timestamp_in_seconds_ = access_seconds;
+    }
+    trace_end_timestamp_in_seconds_ = access_seconds;
+    miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+                                    is_user_access(access.caller),
+                                    access.is_cache_hit == Boolean::kFalse);
+    if (cache_simulator_) {
+      cache_simulator_->Access(access);
+    }
+    access_sequence_number_++;
+
+    const uint64_t duration = elapsed_seconds();
+    if (duration > 10 * time_interval) {
+      report_progress(duration);
+      time_interval++;
+    }
+  }
+  report_progress(elapsed_seconds());
+  return s;
+}
+
+void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const {
+  // Prints block size histograms to stdout: one over all blocks, one per
+  // block type, and one per (column family, block type) pair. Blocks whose
+  // recorded size is 0 are left out.
+  HistogramStat all_blocks_hist;
+  std::map<TraceType, HistogramStat> hist_by_type;
+  std::map<std::string, std::map<TraceType, HistogramStat>> hist_by_cf_type;
+  auto on_block = [&](const std::string& cf_name, uint64_t /*fd*/,
+                      uint32_t /*level*/, TraceType type,
+                      const std::string& /*block_key*/, uint64_t /*block_id*/,
+                      const BlockAccessInfo& block) {
+    const auto size = block.block_size;
+    if (size == 0) {
+      // Block size may be 0 when 1) compaction observes a cache miss and
+      // does not insert the missing block into the cache again. 2)
+      // fetching filter blocks in SST files at the last level.
+      return;
+    }
+    all_blocks_hist.Add(size);
+    hist_by_type[type].Add(size);
+    hist_by_cf_type[cf_name][type].Add(size);
+  };
+  TraverseBlocks(on_block);
+
+  fprintf(stdout, "Block size stats: \n%s", all_blocks_hist.ToString().c_str());
+  for (const auto& type_entry : hist_by_type) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Block size stats for block type %s: \n%s",
+            block_type_to_string(type_entry.first).c_str(),
+            type_entry.second.ToString().c_str());
+  }
+  for (const auto& cf_entry : hist_by_cf_type) {
+    for (const auto& type_entry : cf_entry.second) {
+      print_break_lines(/*num_break_lines=*/1);
+      fprintf(stdout,
+              "Block size stats for column family %s and block type %s: \n%s",
+              cf_entry.first.c_str(),
+              block_type_to_string(type_entry.first).c_str(),
+              type_entry.second.ToString().c_str());
+    }
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only,
+ uint32_t bottom_k,
+ uint32_t top_k) const {
+ HistogramStat access_stats;
+ std::map<TraceType, HistogramStat> bt_stats_map;
+ std::map<std::string, std::map<TraceType, HistogramStat>> cf_bt_stats_map;
+ std::map<uint64_t, std::vector<std::string>> access_count_blocks;
+ auto block_callback = [&](const std::string& cf_name, uint64_t /*fd*/,
+ uint32_t /*level*/, TraceType type,
+ const std::string& block_key, uint64_t /*block_id*/,
+ const BlockAccessInfo& block) {
+ uint64_t naccesses = 0;
+ for (auto const& caller_access : block.caller_num_access_map) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ naccesses += caller_access.second;
+ }
+ }
+ if (naccesses == 0) {
+ return;
+ }
+ if (type == TraceType::kBlockTraceDataBlock) {
+ access_count_blocks[naccesses].push_back(block_key);
+ }
+ access_stats.Add(naccesses);
+ bt_stats_map[type].Add(naccesses);
+ cf_bt_stats_map[cf_name][type].Add(naccesses);
+ };
+ TraverseBlocks(block_callback);
+ fprintf(stdout,
+ "Block access count stats: The number of accesses per block. %s\n%s",
+ user_access_only ? "User accesses only" : "All accesses",
+ access_stats.ToString().c_str());
+ uint32_t bottom_k_index = 0;
+ for (auto naccess_it = access_count_blocks.begin();
+ naccess_it != access_count_blocks.end(); naccess_it++) {
+ bottom_k_index++;
+ if (bottom_k_index >= bottom_k) {
+ break;
+ }
+ std::map<TableReaderCaller, uint64_t> caller_naccesses;
+ uint64_t naccesses = 0;
+ for (auto const& block_id : naccess_it->second) {
+ BlockAccessInfo* block = block_info_map_.find(block_id)->second;
+ for (auto const& caller_access : block->caller_num_access_map) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ caller_naccesses[caller_access.first] += caller_access.second;
+ naccesses += caller_access.second;
+ }
+ }
+ }
+ std::string statistics("Caller:");
+ for (auto const& caller_naccessess_it : caller_naccesses) {
+ statistics += caller_to_string(caller_naccessess_it.first);
+ statistics += ":";
+ statistics +=
+ std::to_string(percent(caller_naccessess_it.second, naccesses));
+ statistics += ",";
+ }
+ fprintf(stdout,
+ "Bottom %" PRIu32 " access count. Access count=%" PRIu64
+ " nblocks=%" ROCKSDB_PRIszt " %s\n",
+ bottom_k, naccess_it->first, naccess_it->second.size(),
+ statistics.c_str());
+ }
+
+ uint32_t top_k_index = 0;
+ for (auto naccess_it = access_count_blocks.rbegin();
+ naccess_it != access_count_blocks.rend(); naccess_it++) {
+ top_k_index++;
+ if (top_k_index >= top_k) {
+ break;
+ }
+ for (auto const& block_id : naccess_it->second) {
+ BlockAccessInfo* block = block_info_map_.find(block_id)->second;
+ std::string statistics("Caller:");
+ uint64_t naccesses = 0;
+ for (auto const& caller_access : block->caller_num_access_map) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ naccesses += caller_access.second;
+ }
+ }
+ assert(naccesses > 0);
+ for (auto const& caller_access : block->caller_num_access_map) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ statistics += ",";
+ statistics += caller_to_string(caller_access.first);
+ statistics += ":";
+ statistics +=
+ std::to_string(percent(caller_access.second, naccesses));
+ }
+ }
+ uint64_t ref_keys_accesses = 0;
+ uint64_t ref_keys_does_not_exist_accesses = 0;
+ for (auto const& ref_key_caller_access : block->key_num_access_map) {
+ for (auto const& caller_access : ref_key_caller_access.second) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ ref_keys_accesses += caller_access.second;
+ }
+ }
+ }
+ for (auto const& ref_key_caller_access :
+ block->non_exist_key_num_access_map) {
+ for (auto const& caller_access : ref_key_caller_access.second) {
+ if (!user_access_only || is_user_access(caller_access.first)) {
+ ref_keys_does_not_exist_accesses += caller_access.second;
+ }
+ }
+ }
+ statistics += ",nkeys=";
+ statistics += std::to_string(block->num_keys);
+ statistics += ",block_size=";
+ statistics += std::to_string(block->block_size);
+ statistics += ",num_ref_keys=";
+ statistics += std::to_string(block->key_num_access_map.size());
+ statistics += ",percent_access_ref_keys=";
+ statistics += std::to_string(percent(ref_keys_accesses, naccesses));
+ statistics += ",num_ref_keys_does_not_exist=";
+ statistics += std::to_string(block->non_exist_key_num_access_map.size());
+ statistics += ",percent_access_ref_keys_does_not_exist=";
+ statistics +=
+ std::to_string(percent(ref_keys_does_not_exist_accesses, naccesses));
+ statistics += ",ref_data_size=";
+ statistics += std::to_string(block->referenced_data_size);
+ fprintf(stdout,
+ "Top %" PRIu32 " access count blocks access_count=%" PRIu64
+ " %s\n",
+ top_k, naccess_it->first, statistics.c_str());
+ }
+ }
+
+ for (auto const& bt_stats : bt_stats_map) {
+ print_break_lines(/*num_break_lines=*/1);
+ fprintf(stdout, "Break down by block type %s: \n%s",
+ block_type_to_string(bt_stats.first).c_str(),
+ bt_stats.second.ToString().c_str());
+ }
+ for (auto const& cf_bt_stats : cf_bt_stats_map) {
+ const std::string& cf_name = cf_bt_stats.first;
+ for (auto const& bt_stats : cf_bt_stats.second) {
+ print_break_lines(/*num_break_lines=*/1);
+ fprintf(stdout,
+ "Break down by column family %s and block type "
+ "%s: \n%s",
+ cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(),
+ bt_stats.second.ToString().c_str());
+ }
+ }
+}
+
+// Prints to stdout a set of histograms describing how the keys of a block are
+// referenced, each followed by a break down per column family. Blocks whose
+// num_keys is 0 are not counted.
+void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const {
+  using CfHistograms = std::map<std::string, HistogramStat>;
+  // Referenced keys that exist in the block / number of keys in the block.
+  HistogramStat existing_keys_stats;
+  CfHistograms cf_existing_keys_stats_map;
+  // Referenced keys that do not exist in the block / number of keys in the
+  // block.
+  HistogramStat non_existing_keys_stats;
+  CfHistograms cf_non_existing_keys_stats_map;
+  // Accesses whose referenced key exists in the block / accesses of the block.
+  HistogramStat block_access_stats;
+  CfHistograms cf_block_access_info;
+  // Average of the per-key, per-caller access counts recorded for a block.
+  HistogramStat avg_naccesses_per_key_in_a_data_block;
+  CfHistograms cf_avg_naccesses_per_key_in_a_data_block;
+  // Standard deviation of the per-key, per-caller access counts of a block.
+  HistogramStat stdev_naccesses_per_key_in_a_data_block;
+  CfHistograms cf_stdev_naccesses_per_key_in_a_data_block;
+
+  // Returns numerator / denominator scaled by 10000 and truncated, i.e. the
+  // ratio kept with four decimal digits as an integer.
+  auto scaled_ratio = [](uint64_t numerator, uint64_t denominator) {
+    return static_cast<uint64_t>((static_cast<double>(numerator) /
+                                  static_cast<double>(denominator)) *
+                                 10000.0);
+  };
+
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/,
+          TraceType /*type*/, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        // Both values are used as divisors below; a zero divisor would turn
+        // the double -> uint64_t conversion into undefined behavior.
+        if (block.num_keys == 0 || block.num_accesses == 0) {
+          return;
+        }
+        const uint64_t percent_referenced_for_existing_keys =
+            scaled_ratio(block.key_num_access_map.size(), block.num_keys);
+        const uint64_t percent_referenced_for_non_existing_keys = scaled_ratio(
+            block.non_exist_key_num_access_map.size(), block.num_keys);
+        const uint64_t percent_accesses_for_existing_keys = scaled_ratio(
+            block.num_referenced_key_exist_in_block, block.num_accesses);
+
+        // One sample per (key, caller) pair of the keys that exist in the
+        // block.
+        HistogramStat hist_naccess_per_key;
+        for (auto const& key_access : block.key_num_access_map) {
+          for (auto const& caller_access : key_access.second) {
+            hist_naccess_per_key.Add(caller_access.second);
+          }
+        }
+        const uint64_t avg_accesses =
+            static_cast<uint64_t>(hist_naccess_per_key.Average());
+        const uint64_t stdev_accesses =
+            static_cast<uint64_t>(hist_naccess_per_key.StandardDeviation());
+
+        avg_naccesses_per_key_in_a_data_block.Add(avg_accesses);
+        cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses);
+        stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses);
+        cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add(
+            stdev_accesses);
+
+        existing_keys_stats.Add(percent_referenced_for_existing_keys);
+        cf_existing_keys_stats_map[cf_name].Add(
+            percent_referenced_for_existing_keys);
+        non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys);
+        cf_non_existing_keys_stats_map[cf_name].Add(
+            percent_referenced_for_non_existing_keys);
+        block_access_stats.Add(percent_accesses_for_existing_keys);
+        cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys);
+      };
+  TraverseBlocks(block_callback);
+
+  // Prints the overall histogram under 'title', then one histogram per column
+  // family. 'break_line_before_each_cf' reproduces the existing layout: the
+  // first three sections separate column families with a break line, the last
+  // two do not.
+  auto print_histograms = [](const char* title, const HistogramStat& overall,
+                             const CfHistograms& per_cf,
+                             bool break_line_before_each_cf) {
+    fprintf(stdout, "%s: \n%s", title, overall.ToString().c_str());
+    for (auto const& cf_stats : per_cf) {
+      if (break_line_before_each_cf) {
+        print_break_lines(/*num_break_lines=*/1);
+      }
+      fprintf(stdout, "Break down by column family %s: \n%s",
+              cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+    }
+  };
+
+  print_histograms(
+      "Histogram on the number of referenced keys existing in a block over "
+      "the total number of keys in a block",
+      existing_keys_stats, cf_existing_keys_stats_map,
+      /*break_line_before_each_cf=*/true);
+  print_break_lines(/*num_break_lines=*/1);
+  print_histograms(
+      "Histogram on the number of referenced keys DO NOT exist in a block over "
+      "the total number of keys in a block",
+      non_existing_keys_stats, cf_non_existing_keys_stats_map,
+      /*break_line_before_each_cf=*/true);
+  print_break_lines(/*num_break_lines=*/1);
+  print_histograms(
+      "Histogram on the number of accesses on keys exist in a block over "
+      "the total number of accesses in a block",
+      block_access_stats, cf_block_access_info,
+      /*break_line_before_each_cf=*/true);
+  print_break_lines(/*num_break_lines=*/1);
+  print_histograms(
+      "Histogram on the average number of accesses per key in a block",
+      avg_naccesses_per_key_in_a_data_block,
+      cf_avg_naccesses_per_key_in_a_data_block,
+      /*break_line_before_each_cf=*/false);
+  print_break_lines(/*num_break_lines=*/1);
+  print_histograms(
+      "Histogram on the standard deviation of the number of accesses per "
+      "key in a block",
+      stdev_naccesses_per_key_in_a_data_block,
+      cf_stdev_naccesses_per_key_in_a_data_block,
+      /*break_line_before_each_cf=*/false);
+}
+
+// Prints to stdout, for every column family and then for the whole trace, the
+// number of files, blocks and accesses, the number of blocks per block type,
+// and the accesses of each caller broken down by level, by file (column
+// family sections only) and by block type.
+void BlockCacheTraceAnalyzer::PrintStatsSummary() const {
+  uint64_t total_num_files = 0;
+  uint64_t total_num_blocks = 0;
+  uint64_t total_num_accesses = 0;
+  std::map<TraceType, uint64_t> bt_num_blocks_map;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  std::map<TableReaderCaller, std::map<TraceType, uint64_t>>
+      caller_bt_num_access_map;
+  std::map<TableReaderCaller, std::map<uint32_t, uint64_t>>
+      caller_level_num_access_map;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    uint64_t cf_num_files = 0;
+    uint64_t cf_num_blocks = 0;
+    std::map<TraceType, uint64_t> cf_bt_blocks;
+    uint64_t cf_num_accesses = 0;
+    std::map<TableReaderCaller, uint64_t> cf_caller_num_accesses_map;
+    std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+        cf_caller_level_num_accesses_map;
+    std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+        cf_caller_file_num_accesses_map;
+    std::map<TableReaderCaller, std::map<TraceType, uint64_t>>
+        cf_caller_bt_num_accesses_map;
+    total_num_files += cf_aggregates.second.fd_aggregates_map.size();
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      cf_num_files++;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        auto const& blocks = block_type_aggregates.second.block_access_info_map;
+        cf_bt_blocks[type] += blocks.size();
+        total_num_blocks += blocks.size();
+        bt_num_blocks_map[type] += blocks.size();
+        for (auto const& block_access_info : blocks) {
+          // Stats per block.
+          cf_num_blocks++;
+          for (auto const& stats :
+               block_access_info.second.caller_num_access_map) {
+            // Stats per caller.
+            const TableReaderCaller caller = stats.first;
+            const uint64_t num_accesses = stats.second;
+            // Overall stats.
+            total_num_accesses += num_accesses;
+            caller_num_access_map[caller] += num_accesses;
+            caller_bt_num_access_map[caller][type] += num_accesses;
+            caller_level_num_access_map[caller][level] += num_accesses;
+            // Column Family stats.
+            cf_num_accesses += num_accesses;
+            cf_caller_num_accesses_map[caller] += num_accesses;
+            cf_caller_level_num_accesses_map[caller][level] += num_accesses;
+            cf_caller_file_num_accesses_map[caller][fd] += num_accesses;
+            cf_caller_bt_num_accesses_map[caller][type] += num_accesses;
+          }
+        }
+      }
+    }
+
+    // Print stats.
+    print_break_lines(/*num_break_lines=*/3);
+    fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str());
+    fprintf(stdout,
+            " Number of files:%" PRIu64 " Number of blocks: %" PRIu64
+            " Number of accesses: %" PRIu64 "\n",
+            cf_num_files, cf_num_blocks, cf_num_accesses);
+    for (auto const& block_type : cf_bt_blocks) {
+      fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n",
+              block_type_to_string(block_type.first).c_str(), block_type.second,
+              percent(block_type.second, cf_num_blocks));
+    }
+    for (auto const& caller : cf_caller_num_accesses_map) {
+      const uint64_t naccesses = caller.second;
+      print_break_lines(/*num_break_lines=*/1);
+      fprintf(stdout,
+              "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n",
+              caller_to_string(caller.first).c_str(), naccesses,
+              percent(naccesses, cf_num_accesses));
+      fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+              caller_to_string(caller.first).c_str());
+      // The level keys of this map are uint64_t, hence PRIu64.
+      for (auto const& naccess_level :
+           cf_caller_level_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Level %" PRIu64 ": Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                naccess_level.first, naccess_level.second,
+                percent(naccess_level.second, naccesses));
+      }
+      fprintf(stdout, "Caller %s: Number of accesses per file break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto const& naccess_file :
+           cf_caller_file_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t File %" PRIu64 ": Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                naccess_file.first, naccess_file.second,
+                percent(naccess_file.second, naccesses));
+      }
+      fprintf(stdout,
+              "Caller %s: Number of accesses per block type break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto const& naccess_type :
+           cf_caller_bt_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Block Type %s: Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                block_type_to_string(naccess_type.first).c_str(),
+                naccess_type.second, percent(naccess_type.second, naccesses));
+      }
+    }
+  }
+  print_break_lines(/*num_break_lines=*/3);
+  fprintf(stdout, "Overall statistics:\n");
+  fprintf(stdout,
+          "Number of files: %" PRIu64 " Number of blocks: %" PRIu64
+          " Number of accesses: %" PRIu64 "\n",
+          total_num_files, total_num_blocks, total_num_accesses);
+  for (auto const& block_type : bt_num_blocks_map) {
+    fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n",
+            block_type_to_string(block_type.first).c_str(), block_type.second,
+            percent(block_type.second, total_num_blocks));
+  }
+  for (auto const& caller : caller_num_access_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    const uint64_t naccesses = caller.second;
+    fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n",
+            caller_to_string(caller.first).c_str(), naccesses,
+            percent(naccesses, total_num_accesses));
+    fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+            caller_to_string(caller.first).c_str());
+    // The level keys of this map are uint32_t; PRIu32 matches the argument
+    // type, which the previous "%d" (signed int) did not.
+    for (auto const& naccess_level : caller_level_num_access_map[caller.first]) {
+      fprintf(stdout,
+              "\t Level %" PRIu32 ": Number of accesses: %" PRIu64
+              " Percent: %.2f\n",
+              naccess_level.first, naccess_level.second,
+              percent(naccess_level.second, naccesses));
+    }
+    fprintf(stdout, "Caller %s: Number of accesses per block type break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto const& naccess_type : caller_bt_num_access_map[caller.first]) {
+      fprintf(stdout,
+              "\t Block Type %s: Number of accesses: %" PRIu64
+              " Percent: %.2f\n",
+              block_type_to_string(naccess_type.first).c_str(),
+              naccess_type.second, percent(naccess_type.second, naccesses));
+    }
+  }
+}
+
+// Parses the cache simulator configuration file at 'config_path'. Every line
+// has the form
+//   cache_name,num_shard_bits,ghost_cache_capacity,capacity_1[,capacity_2...]
+// and yields one CacheConfiguration. Returns an empty vector when the file
+// cannot be opened. A line with fewer than four fields, an unsupported cache
+// name, or a capacity that parses to 0 terminates the process with exit(1).
+std::vector<CacheConfiguration> parse_cache_config_file(
+    const std::string& config_path) {
+  std::vector<CacheConfiguration> configs;
+  std::ifstream file(config_path);
+  if (!file.is_open()) {
+    return configs;
+  }
+  for (std::string line; getline(file, line);) {
+    // Split the line on ','. The stream is polled with good() before each
+    // read, so an empty line or a trailing ',' contributes an empty field.
+    std::vector<std::string> fields;
+    std::stringstream ss(line);
+    for (std::string field; ss.good();) {
+      getline(ss, field, ',');
+      fields.push_back(field);
+    }
+    // Sanity checks.
+    if (fields.size() < 4) {
+      fprintf(stderr, "Invalid cache simulator configuration %s\n",
+              line.c_str());
+      exit(1);
+    }
+    // The name is looked up surrounded by spaces inside kSupportedCacheNames.
+    const std::string padded_name = " " + fields[0] + " ";
+    if (kSupportedCacheNames.find(padded_name) == std::string::npos) {
+      fprintf(stderr, "Invalid cache name %s. Supported cache names are %s\n",
+              line.c_str(), kSupportedCacheNames.c_str());
+      exit(1);
+    }
+    CacheConfiguration cache_config;
+    cache_config.cache_name = fields[0];
+    cache_config.num_shard_bits = ParseUint32(fields[1]);
+    cache_config.ghost_cache_capacity = ParseUint64(fields[2]);
+    // Every remaining field is one cache capacity to simulate.
+    for (auto it = fields.begin() + 3; it != fields.end(); ++it) {
+      const uint64_t capacity = ParseUint64(*it);
+      if (capacity == 0) {
+        fprintf(stderr, "Invalid cache capacity %s, %s\n", it->c_str(),
+                line.c_str());
+        exit(1);
+      }
+      cache_config.cache_capacities.push_back(capacity);
+    }
+    configs.push_back(cache_config);
+  }
+  file.close();
+  return configs;
+}
+
+// Converts a comma separated list of numbers into bucket boundaries, in the
+// order given, and appends port::kMaxUint64 as the final boundary. The stream
+// is polled with good() before each read, so an empty 'bucket_str' or a
+// trailing ',' passes an empty token to ParseUint64.
+std::vector<uint64_t> parse_buckets(const std::string& bucket_str) {
+  std::vector<uint64_t> boundaries;
+  std::stringstream tokens(bucket_str);
+  for (std::string token; tokens.good();) {
+    getline(tokens, token, ',');
+    boundaries.push_back(ParseUint64(token));
+  }
+  boundaries.push_back(port::kMaxUint64);
+  return boundaries;
+}
+
+// Splits 'str' on ','. The stream is polled with good() before each read, so
+// an empty 'str' or a trailing ',' yields an empty token.
+static std::vector<std::string> split_comma_separated_flag(
+    const std::string& str) {
+  std::vector<std::string> tokens;
+  std::stringstream ss(str);
+  while (ss.good()) {
+    std::string token;
+    getline(ss, token, ',');
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+// Entry point of the block cache trace analyzer tool. Parses the command line
+// flags, optionally builds the cache simulator from the configuration file,
+// analyzes the trace, and then writes/prints every result selected by the
+// flags. Returns 0 on success; calls exit(1) when the trace path is empty,
+// the simulator cannot be initialized, or the trace cannot be processed.
+int block_cache_trace_analyzer_tool(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_block_cache_trace_path.empty()) {
+    fprintf(stderr, "block cache trace path is empty\n");
+    exit(1);
+  }
+  // Non-positive flag values are treated as 0.
+  uint64_t warmup_seconds =
+      FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0;
+  uint32_t downsample_ratio = FLAGS_block_cache_trace_downsample_ratio > 0
+                                  ? FLAGS_block_cache_trace_downsample_ratio
+                                  : 0;
+  std::vector<CacheConfiguration> cache_configs =
+      parse_cache_config_file(FLAGS_block_cache_sim_config_path);
+  // The simulator is only created when at least one configuration was parsed.
+  std::unique_ptr<BlockCacheTraceSimulator> cache_simulator;
+  if (!cache_configs.empty()) {
+    cache_simulator.reset(new BlockCacheTraceSimulator(
+        warmup_seconds, downsample_ratio, cache_configs));
+    Status s = cache_simulator->InitializeCaches();
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot initialize cache simulators %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+  BlockCacheTraceAnalyzer analyzer(
+      FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir,
+      FLAGS_human_readable_trace_file_path,
+      !FLAGS_reuse_distance_labels.empty(), FLAGS_mrc_only,
+      FLAGS_is_block_cache_human_readable_trace, std::move(cache_simulator));
+  Status s = analyzer.Analyze();
+  // An Incomplete status is not treated as a failure; any other non-OK status
+  // aborts the tool.
+  if (!s.IsIncomplete() && !s.ok()) {
+    fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str());
+    exit(1);
+  }
+  fprintf(stdout, "Status: %s\n", s.ToString().c_str());
+  analyzer.WriteMissRatioCurves();
+  analyzer.WriteMissRatioTimeline(1);
+  analyzer.WriteMissRatioTimeline(kSecondInMinute);
+  analyzer.WriteMissRatioTimeline(kSecondInHour);
+  analyzer.WriteMissTimeline(1);
+  analyzer.WriteMissTimeline(kSecondInMinute);
+  analyzer.WriteMissTimeline(kSecondInHour);
+
+  if (FLAGS_mrc_only) {
+    // Terminate the message with a newline so it does not run into whatever
+    // is printed next on the terminal.
+    fprintf(stdout,
+            "Skipping the analysis statistics since the user wants to compute "
+            "MRC only\n");
+    return 0;
+  }
+
+  analyzer.PrintStatsSummary();
+  if (FLAGS_print_access_count_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintAccessCountStats(
+        /*user_access_only=*/false, FLAGS_analyze_bottom_k_access_count_blocks,
+        FLAGS_analyze_top_k_access_count_blocks);
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintAccessCountStats(
+        /*user_access_only=*/true, FLAGS_analyze_bottom_k_access_count_blocks,
+        FLAGS_analyze_top_k_access_count_blocks);
+  }
+  if (FLAGS_print_block_size_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintBlockSizeStats();
+  }
+  if (FLAGS_print_data_block_access_count_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintDataBlockAccessStats();
+  }
+  print_break_lines(/*num_break_lines=*/3);
+
+  if (!FLAGS_timeline_labels.empty()) {
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_timeline_labels)) {
+      // Labels containing "block" are additionally written restricted to user
+      // accesses.
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, true);
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, false);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, true);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, false);
+      } else {
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, false);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, false);
+      }
+    }
+  }
+
+  if (!FLAGS_analyze_callers.empty()) {
+    analyzer.WritePercentAccessSummaryStats();
+    for (const std::string& caller :
+         split_comma_separated_flag(FLAGS_analyze_callers)) {
+      analyzer.WriteDetailedPercentAccessSummaryStats(string_to_caller(caller));
+    }
+  }
+
+  if (!FLAGS_access_count_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_access_count_buckets);
+    analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/true);
+    analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/false);
+  }
+
+  if (!FLAGS_reuse_distance_labels.empty() &&
+      !FLAGS_reuse_distance_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_distance_buckets);
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_reuse_distance_labels)) {
+      analyzer.WriteReuseDistance(label, buckets);
+    }
+  }
+
+  if (!FLAGS_reuse_interval_labels.empty() &&
+      !FLAGS_reuse_interval_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_interval_buckets);
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_reuse_interval_labels)) {
+      analyzer.WriteReuseInterval(label, buckets);
+    }
+  }
+
+  if (!FLAGS_reuse_lifetime_labels.empty() &&
+      !FLAGS_reuse_lifetime_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_lifetime_buckets);
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_reuse_lifetime_labels)) {
+      analyzer.WriteReuseLifetime(label, buckets);
+    }
+  }
+
+  if (FLAGS_analyze_blocks_reuse_k_reuse_window != 0) {
+    std::vector<TraceType> block_types{TraceType::kBlockTraceIndexBlock,
+                                       TraceType::kBlockTraceDataBlock,
+                                       TraceType::kBlockTraceFilterBlock};
+    for (auto block_type : block_types) {
+      analyzer.WriteBlockReuseTimeline(
+          FLAGS_analyze_blocks_reuse_k_reuse_window,
+          /*user_access_only=*/true, block_type);
+      analyzer.WriteBlockReuseTimeline(
+          FLAGS_analyze_blocks_reuse_k_reuse_window,
+          /*user_access_only=*/false, block_type);
+    }
+  }
+
+  if (!FLAGS_analyze_get_spatial_locality_labels.empty() &&
+      !FLAGS_analyze_get_spatial_locality_buckets.empty()) {
+    std::vector<uint64_t> buckets =
+        parse_buckets(FLAGS_analyze_get_spatial_locality_buckets);
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_analyze_get_spatial_locality_labels)) {
+      analyzer.WriteGetSpatialLocality(label, buckets);
+    }
+  }
+
+  if (!FLAGS_analyze_correlation_coefficients_labels.empty()) {
+    for (const std::string& label : split_comma_separated_flag(
+             FLAGS_analyze_correlation_coefficients_labels)) {
+      analyzer.WriteCorrelationFeatures(
+          label, FLAGS_analyze_correlation_coefficients_max_number_of_values);
+    }
+    analyzer.WriteCorrelationFeaturesForGet(
+        FLAGS_analyze_correlation_coefficients_max_number_of_values);
+  }
+
+  if (!FLAGS_skew_labels.empty() && !FLAGS_skew_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_skew_buckets);
+    for (const std::string& label :
+         split_comma_separated_flag(FLAGS_skew_labels)) {
+      // Labels containing "block" are also written per block type before the
+      // TraceType::kTraceMax variant.
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceIndexBlock);
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceFilterBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kBlockTraceDataBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      } else {
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      }
+    }
+  }
+  return 0;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // GFLAGS
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h
new file mode 100644
index 000000000..48a544813
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h
@@ -0,0 +1,393 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/sim_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "utilities/simulator_cache/cache_simulator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Statistics of a key referenced by a Get.
+struct GetKeyInfo {
+  uint64_t key_id = 0;
+  // Sequence number of every recorded access, in the order they were added.
+  std::vector<uint64_t> access_sequence_number_timeline;
+  // access_timestamp of every recorded access, in the order they were added.
+  std::vector<uint64_t> access_timeline;
+
+  // Records one access: appends its timestamp and the given sequence number
+  // to the two timelines.
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequence_number) {
+    access_timeline.push_back(access.access_timestamp);
+    access_sequence_number_timeline.push_back(access_sequence_number);
+  }
+};
+
+// Access statistics accumulated for a single block.
+struct BlockAccessInfo {
+  uint64_t block_id = 0;
+  uint64_t table_id = 0;
+  uint64_t block_offset = 0;
+  uint64_t num_accesses = 0;
+  uint64_t block_size = 0;
+  uint64_t first_access_time = 0;
+  uint64_t last_access_time = 0;
+  uint64_t num_keys = 0;
+  // key -> caller -> number of accesses, for keys that exist in this block.
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      key_num_access_map;
+  // key -> caller -> number of accesses, for keys that do not exist in this
+  // block.
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      non_exist_key_num_access_map;
+  // Number of accesses whose referenced key exists in this block.
+  uint64_t num_referenced_key_exist_in_block = 0;
+  // Sum of referenced_data_size over the distinct existing keys referenced.
+  uint64_t referenced_data_size = 0;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  // caller:timestamp:number_of_accesses. The granularity of the timestamp is
+  // seconds.
+  std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+      caller_num_accesses_timeline;
+  // Unique blocks since the last access.
+  std::set<std::string> unique_blocks_since_last_access;
+  // Number of reuses grouped by reuse distance.
+  std::map<uint64_t, uint64_t> reuse_distance_count;
+
+  // The access sequence numbers of this block, overall and per caller.
+  std::vector<uint64_t> access_sequence_number_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>>
+      caller_access_sequence__number_timeline;
+  // The access timestamps in microseconds of this block, overall and per
+  // caller.
+  std::vector<uint64_t> access_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>> caller_access_timeline;
+
+  // Folds one trace record into the statistics of this block.
+  // 'access_sequence_number' is appended to the sequence number timelines.
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequence_number) {
+    // When both the stored and the incoming value are known (non-zero), they
+    // are expected to agree.
+    assert(block_size == 0 || access.block_size == 0 ||
+           block_size == access.block_size);
+    assert(num_keys == 0 || access.num_keys_in_block == 0 ||
+           num_keys == access.num_keys_in_block);
+
+    // A first_access_time of 0 means no access has been recorded yet.
+    if (first_access_time == 0) {
+      first_access_time = access.access_timestamp;
+    }
+    last_access_time = access.access_timestamp;
+    table_id = BlockCacheTraceHelper::GetTableId(access);
+    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
+    block_size = access.block_size;
+    ++num_accesses;
+    ++caller_num_access_map[access.caller];
+
+    // access.access_timestamp is in microseconds; the per-caller timeline is
+    // bucketed by second.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
+
+    // Populate the feature vectors.
+    access_sequence_number_timeline.push_back(access_sequence_number);
+    caller_access_sequence__number_timeline[access.caller].push_back(
+        access_sequence_number);
+    access_timeline.push_back(access.access_timestamp);
+    caller_access_timeline[access.caller].push_back(access.access_timestamp);
+
+    // Key level statistics are only kept for Get/MultiGet on data blocks.
+    if (!BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type,
+                                                           access.caller)) {
+      return;
+    }
+    num_keys = access.num_keys_in_block;
+    if (access.referenced_key_exist_in_block != Boolean::kTrue) {
+      ++non_exist_key_num_access_map[access.referenced_key][access.caller];
+      return;
+    }
+    // Count the referenced data size only the first time a key is seen.
+    if (key_num_access_map.count(access.referenced_key) == 0) {
+      referenced_data_size += access.referenced_data_size;
+    }
+    ++key_num_access_map[access.referenced_key][access.caller];
+    ++num_referenced_key_exist_in_block;
+    if (block_size != 0 && referenced_data_size > block_size) {
+      // NOTE(review): the parsed key is discarded, so this looks like a
+      // debugging leftover; kept as-is because the side effects of
+      // ParseInternalKey are not visible from this header - confirm before
+      // removing.
+      ParsedInternalKey internal_key;
+      ParseInternalKey(access.referenced_key, &internal_key);
+    }
+  }
+};
+
+// Aggregates the per-block statistics of all blocks that share a block type.
+struct BlockTypeAccessInfoAggregate {
+ std::map<std::string, BlockAccessInfo> block_access_info_map;  // one entry per block; key is presumably the block key (TODO confirm)
+};
+
+// Aggregates the BlockTypeAccessInfoAggregate of every block type found in a
+// single SST file.
+struct SSTFileAccessInfoAggregate {
+  // Level of the SST file. Zero-initialized, like the scalar members of the
+  // other structs in this header, so a default-constructed aggregate never
+  // exposes an indeterminate value.
+  uint32_t level = 0;
+  // Per block type statistics of the blocks in this file.
+  std::map<TraceType, BlockTypeAccessInfoAggregate> block_type_aggregates_map;
+};
+
+// Aggregates the SSTFileAccessInfoAggregate of every SST file in a column family.
+struct ColumnFamilyAccessInfoAggregate {
+ std::map<uint64_t, SSTFileAccessInfoAggregate> fd_aggregates_map;  // one entry per SST file; PrintStatsSummary reads the key as the file's fd
+};
+
+struct Features {  // Parallel feature vectors; NOTE(review): filled outside this part of the header, presumably one entry per access - confirm.
+ std::vector<uint64_t> elapsed_time_since_last_access;
+ std::vector<uint64_t> num_accesses_since_last_access;
+ std::vector<uint64_t> num_past_accesses;
+};
+
+struct Predictions {  // Parallel target vectors; NOTE(review): filled outside this part of the header, presumably aligned with Features - confirm.
+ std::vector<uint64_t> elapsed_time_till_next_access;
+ std::vector<uint64_t> num_accesses_till_next_access;
+};
+
+class BlockCacheTraceAnalyzer {
+ public:
+ BlockCacheTraceAnalyzer(
+ const std::string& trace_file_path, const std::string& output_dir,
+ const std::string& human_readable_trace_file_path,
+ bool compute_reuse_distance, bool mrc_only,
+ bool is_human_readable_trace_file,
+ std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
+ ~BlockCacheTraceAnalyzer() = default;
+ // No copy and move.
+ BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
+ BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
+ BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
+ BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;
+
+ // Read all access records in the given trace_file, maintains the stats of
+ // a block, and aggregates the information by block type, sst file, and column
+ // family. Subsequently, the caller may call Print* functions to print
+ // statistics.
+ Status Analyze();
+
+ // Print a summary of statistics of the trace, e.g.,
+ // Number of files: 2 Number of blocks: 50 Number of accesses: 50
+ // Number of Index blocks: 10
+ // Number of Filter blocks: 10
+ // Number of Data blocks: 10
+ // Number of UncompressionDict blocks: 10
+ // Number of RangeDeletion blocks: 10
+ // ***************************************************************
+ // Caller Get: Number of accesses 10
+ // Caller Get: Number of accesses per level break down
+ // Level 0: Number of accesses: 10
+ // Caller Get: Number of accesses per block type break down
+ // Block Type Index: Number of accesses: 2
+ // Block Type Filter: Number of accesses: 2
+ // Block Type Data: Number of accesses: 2
+ // Block Type UncompressionDict: Number of accesses: 2
+ // Block Type RangeDeletion: Number of accesses: 2
+ void PrintStatsSummary() const;
+
+ // Print block size distribution and the distribution break down by block type
+ // and column family.
+ void PrintBlockSizeStats() const;
+
+ // Print access count distribution and the distribution break down by block
+ // type and column family.
+ void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
+ uint32_t top_k) const;
+
+ // Print data block accesses by user Get and Multi-Get.
+ // It prints out 1) A histogram on the percentage of keys accessed in a data
+ // block break down by if a referenced key exists in the data block and the
+ // histogram break down by column family. 2) A histogram on the percentage of
+ // accesses on keys exist in a data block and its break down by column family.
+ void PrintDataBlockAccessStats() const;
+
+ // Write the percentage of accesses break down by column family into a csv
+ // file saved in 'output_dir'.
+ //
+ // The file is named "percentage_of_accesses_summary". The file format is
+ // caller,cf_0,cf_1,...,cf_n where the cf_i is the column family name found in
+ // the trace.
+ void WritePercentAccessSummaryStats() const;
+
+ // Write the percentage of accesses for the given caller break down by column
+ // family, level, and block type into a csv file saved in 'output_dir'.
+ //
+ // It generates two files: 1) caller_level_percentage_of_accesses_summary and
+ // 2) caller_bt_percentage_of_accesses_summary which break down by the level
+ // and block type, respectively. The file format is
+ // level/bt,cf_0,cf_1,...,cf_n where cf_i is the column family name found in
+ // the trace.
+ void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;
+
+ // Write the access count summary into a csv file saved in 'output_dir'.
+ // It groups blocks by their access count.
+ //
+ // It generates two files: 1) cf_access_count_summary and 2)
+ // bt_access_count_summary which break down the access count by column family
+ // and block type, respectively. The file format is
+ // cf/bt,bucket_0,bucket_1,...,bucket_N.
+ void WriteAccessCountSummaryStats(
+ const std::vector<uint64_t>& access_count_buckets,
+ bool user_access_only) const;
+
+ // Write miss ratio curves of simulated cache configurations into a csv file
+ // named "mrc" saved in 'output_dir'.
+ //
+ // The file format is
+ // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
+ void WriteMissRatioCurves() const;
+
+ // Write miss ratio timeline of simulated cache configurations into several
+ // csv files, one per cache capacity saved in 'output_dir'.
+ //
+ // The file format is
+ // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+ // where N is the number of unique cache names
+ // (cache_name+num_shard_bits+ghost_capacity).
+ void WriteMissRatioTimeline(uint64_t time_unit) const;
+
+ // Write misses timeline of simulated cache configurations into several
+ // csv files, one per cache capacity saved in 'output_dir'.
+ //
+ // The file format is
+ // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+ // where N is the number of unique cache names
+ // (cache_name+num_shard_bits+ghost_capacity).
+ void WriteMissTimeline(uint64_t time_unit) const;
+
+ // Write the access timeline into a csv file saved in 'output_dir'.
+ //
+ // The file is named "label_access_timeline". The file format is
+ // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+ // where N is the number of unique labels found in the trace.
+ void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
+ bool user_access_only) const;
+
+ // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+ // distance is defined as the cumulated size of unique blocks read between two
+ // consecutive accesses on the same block.
+ //
+ // The file is named "label_reuse_distance". The file format is
+ // bucket,label_1,label_2,...,label_N.
+ void WriteReuseDistance(const std::string& label_str,
+ const std::vector<uint64_t>& distance_buckets) const;
+
+ // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+ // interval is defined as the time between two consecutive accesses on the
+ // same block.
+ //
+ // The file is named "label_reuse_interval". The file format is
+ // bucket,label_1,label_2,...,label_N.
+ void WriteReuseInterval(const std::string& label_str,
+ const std::vector<uint64_t>& time_buckets) const;
+
+ // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
+ // lifetime is defined as the time interval between the first access of a
+ // block and its last access.
+ //
+ // The file is named "label_reuse_lifetime". The file format is
+ // bucket,label_1,label_2,...,label_N.
+ void WriteReuseLifetime(const std::string& label_str,
+ const std::vector<uint64_t>& time_buckets) const;
+
+ // Write the reuse timeline into a csv file saved in 'output_dir'.
+ //
+ // The file is named
+ // "block_type_user_access_only_reuse_window_reuse_timeline". The file format
+ // is start_time,0,1,...,N where N equals trace_duration / reuse_window.
+ void WriteBlockReuseTimeline(const uint64_t reuse_window, bool user_access_only,
+ TraceType block_type) const;
+
+ // Write the Get spatial locality into csv files saved in 'output_dir'.
+ //
+ // It generates three csv files. label_percent_ref_keys,
+ // label_percent_accesses_on_ref_keys, and
+ // label_percent_data_size_on_ref_keys.
+ void WriteGetSpatialLocality(
+ const std::string& label_str,
+ const std::vector<uint64_t>& percent_buckets) const;
+
+ void WriteCorrelationFeatures(const std::string& label_str,
+ uint32_t max_number_of_values) const;
+
+ void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const;
+
+ void WriteSkewness(const std::string& label_str,
+ const std::vector<uint64_t>& percent_buckets,
+ TraceType target_block_type) const;
+
+ // Returns a const reference to cf_aggregates_map_ without copying it.
+ // NOTE(review): the TEST_ prefix suggests this accessor exists for unit
+ // tests only -- confirm before using it elsewhere.
+ const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
+ TEST_cf_aggregates_map() const {
+ return cf_aggregates_map_;
+ }
+
+ private:
+ std::set<std::string> ParseLabelStr(const std::string& label_str) const;
+
+ std::string BuildLabel(const std::set<std::string>& labels,
+ const std::string& cf_name, uint64_t fd,
+ uint32_t level, TraceType type,
+ TableReaderCaller caller, uint64_t block_key,
+ const BlockAccessInfo& block) const;
+
+ void ComputeReuseDistance(BlockAccessInfo* info) const;
+
+ Status RecordAccess(const BlockCacheTraceRecord& access);
+
+ void UpdateReuseIntervalStats(
+ const std::string& label, const std::vector<uint64_t>& time_buckets,
+ const std::map<uint64_t, uint64_t> timeline,
+ std::map<std::string, std::map<uint64_t, uint64_t>>*
+ label_time_num_reuses,
+ uint64_t* total_num_reuses) const;
+
+ std::string OutputPercentAccessStats(
+ uint64_t total_accesses,
+ const std::map<std::string, uint64_t>& cf_access_count) const;
+
+ void WriteStatsToFile(
+ const std::string& label_str, const std::vector<uint64_t>& time_buckets,
+ const std::string& filename_suffix,
+ const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data,
+ uint64_t ntotal) const;
+
+ void TraverseBlocks(
+ std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
+ uint32_t /*level*/, TraceType /*block_type*/,
+ const std::string& /*block_key*/,
+ uint64_t /*block_key_id*/,
+ const BlockAccessInfo& /*block_access_info*/)>
+ block_callback,
+ std::set<std::string>* labels = nullptr) const;
+
+ void UpdateFeatureVectors(
+ const std::vector<uint64_t>& access_sequence_number_timeline,
+ const std::vector<uint64_t>& access_timeline, const std::string& label,
+ std::map<std::string, Features>* label_features,
+ std::map<std::string, Predictions>* label_predictions) const;
+
+ void WriteCorrelationFeaturesToFile(
+ const std::string& label,
+ const std::map<std::string, Features>& label_features,
+ const std::map<std::string, Predictions>& label_predictions,
+ uint32_t max_number_of_values) const;
+
+ ROCKSDB_NAMESPACE::Env* env_;
+ const std::string trace_file_path_;
+ const std::string output_dir_;
+ std::string human_readable_trace_file_path_;
+ const bool compute_reuse_distance_;
+ const bool mrc_only_;
+ const bool is_human_readable_trace_file_;
+
+ BlockCacheTraceHeader header_;
+ std::unique_ptr<BlockCacheTraceSimulator> cache_simulator_;
+ std::map<std::string, ColumnFamilyAccessInfoAggregate> cf_aggregates_map_;
+ std::map<std::string, BlockAccessInfo*> block_info_map_;
+ std::unordered_map<std::string, GetKeyInfo> get_key_info_map_;
+ uint64_t access_sequence_number_ = 0;
+ uint64_t trace_start_timestamp_in_seconds_ = 0;
+ uint64_t trace_end_timestamp_in_seconds_ = 0;
+ MissRatioStats miss_ratio_stats_;
+ uint64_t unique_block_id_ = 1;
+ uint64_t unique_get_key_id_ = 1;
+ BlockCacheHumanReadableTraceWriter human_readable_trace_writer_;
+};
+
+int block_cache_trace_analyzer_tool(int argc, char** argv);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
new file mode 100644
index 000000000..0fdaa4158
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
@@ -0,0 +1,721 @@
+#!/usr/bin/env python3
+import csv
+import math
+import os
+import random
+import sys
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.backends.backend_pdf
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+
+# Make sure a legend has the same color across all generated graphs.
def get_cmap(n, name="hsv"):
    """Return a colormap with ``n`` entries.

    The returned object maps each index in 0, 1, ..., n-1 to a distinct RGB
    color; ``name`` must be a standard matplotlib colormap name.
    """
    # pyplot.get_cmap(name, lut) is available in both old and current
    # matplotlib releases, whereas the matplotlib.cm.get_cmap entry point used
    # previously has been removed from recent versions.
    return plt.get_cmap(name, n)
+
+
# Shared plotting state: the next unused slot in `colors` and the label -> color
# assignments made so far, so a label keeps its color across every graph.
color_index = 0
bar_color_maps = {}
n_colors = 360
linear_colors = get_cmap(n_colors)
colors = [linear_colors(slot) for slot in range(n_colors)]
# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
random.shuffle(colors)
+
+
def num_to_gb(n):
    """Format a byte count as a string expressed in GiB.

    ``n`` may be a number or a numeric string. Exact multiples of 1 GiB are
    rendered with the default float formatting (e.g. ``"2.0"``); every other
    value is rounded to two decimal places (e.g. ``"1.50"``).
    """
    one_gb = 1024 * 1024 * 1024
    # Convert once and use the converted value everywhere: the previous code
    # tested float(n) but divided the raw argument, so a numeric string passed
    # the test and then raised TypeError on the division.
    value = float(n)
    if value % one_gb == 0:
        return "{}".format(value / one_gb)
    # Keep two decimal points.
    return "{0:.2f}".format(value / one_gb)
+
+
def plot_miss_stats_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot a miss statistic against cache capacity, one curve per cache config.

    Every file in ``csv_result_dir`` whose name starts with ``file_prefix`` and
    ends with ``file_suffix`` is read as comma-separated rows of
    ``cache_name, num_shard_bits, ghost_capacity, capacity, value``. Rows are
    grouped under the key ``"cache_name-num_shard_bits-ghost_capacity"``.
    After each file the curves accumulated so far are drawn and saved to
    ``<output_result_dir>/<pdf_file_name>.pdf``.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                series = miss_ratios.setdefault(config, {"x": [], "y": []})
                series["x"].append(capacity)
                series["y"].append(miss_ratio)
        fig = plt.figure()
        for config, series in miss_ratios.items():
            plt.plot(series["x"], series["y"], label=config)
        plt.xlabel("Cache capacity")
        plt.ylabel(ylabel)
        try:
            # Current matplotlib spells the log base keyword "base".
            plt.xscale("log", base=2)
        except (TypeError, ValueError):
            # Older matplotlib releases only understand "basex".
            plt.xscale("log", basex=2)
        # "bottom" is the keyword accepted by all matplotlib versions; the
        # "ymin" alias has been removed from recent ones.
        plt.ylim(bottom=0)
        plt.title("{}".format(file))
        plt.legend()
        fig.savefig(
            output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
        )
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
+
+
def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot each cache config's miss statistic relative to the "lru-0-0" config.

    Input files are selected and parsed like the plain miss-statistics plot:
    files in ``csv_result_dir`` matching ``file_prefix``/``file_suffix`` hold
    rows of ``cache_name, num_shard_bits, ghost_capacity, capacity, value``.
    For every capacity of the "lru-0-0" config, the plotted value is
    ``value(config) - value(lru-0-0)`` at the same capacity, or 0 when the
    config has no sample at that capacity. The function returns without
    plotting as soon as the data read so far contains no "lru-0-0" config.
    The figure is saved to ``<output_result_dir>/<pdf_file_name>.pdf``.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                series = miss_ratios.setdefault(config, {"x": [], "y": []})
                series["x"].append(capacity)
                series["y"].append(miss_ratio)
        if "lru-0-0" not in miss_ratios:
            return
        baseline = miss_ratios["lru-0-0"]
        fig = plt.figure()
        for config, series in miss_ratios.items():
            # Index this config's samples by capacity, keeping the first sample
            # for a repeated capacity, so each baseline point is matched with a
            # dictionary lookup instead of a nested scan.
            value_at = {}
            for capacity, value in zip(series["x"], series["y"]):
                value_at.setdefault(capacity, value)
            diffs = [
                value_at[capacity] - base_value if capacity in value_at else 0
                for capacity, base_value in zip(baseline["x"], baseline["y"])
            ]
            plt.plot(baseline["x"], diffs, label=config)
        plt.xlabel("Cache capacity")
        plt.ylabel(ylabel)
        try:
            # Current matplotlib spells the log base keyword "base".
            plt.xscale("log", base=2)
        except (TypeError, ValueError):
            # Older matplotlib releases only understand "basex".
            plt.xscale("log", basex=2)
        plt.title("{}".format(file))
        plt.legend()
        fig.savefig(
            output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
        )
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
+
+
def sanitize(label):
    """Return ``label`` in a form that is safe to use as a plot legend entry.

    Leading underscores are removed because matplotlib does not show legend
    entries whose label starts with "_". Any label that contains the decimal
    value of C++ ``uint64_max`` is replaced by the string ``"max"``.
    """
    data = label.lstrip("_")
    # The value of uint64_max in c++.
    if "18446744073709551615" in data:
        return "max"
    return data
+
+
+# Read the csv file vertically, i.e., group the data by columns.
def read_data_for_plot_vertical(csvfile):
    """Read a csv stream column-wise.

    The first row is the header: every cell after the first is a label. In
    each following row the first cell is an x value and cell ``j`` belongs to
    label ``j - 1``.

    Returns a tuple ``(x, labels, label_stats)`` where ``x`` and ``labels``
    hold sanitized strings and ``label_stats`` maps a label index to its list
    of float values. An empty input yields ``([], [], {})``.
    """
    data_rows = list(csv.reader(csvfile, delimiter=","))
    if not data_rows:
        # No header row at all: report "nothing to plot" instead of raising
        # IndexError; callers skip files whose x/labels are empty.
        return [], [], {}
    labels = [sanitize(name) for name in data_rows[0][1:]]
    label_stats = {index: [] for index in range(len(labels))}
    x = []
    for row in data_rows[1:]:
        for column, cell in enumerate(row):
            if column == 0:
                x.append(sanitize(cell))
            else:
                label_stats[column - 1].append(float(cell))
    return x, labels, label_stats
+
+
+# Read the csv file horizontally, i.e., group the data by rows.
def read_data_for_plot_horizontal(csvfile):
    """Read a csv stream row-wise.

    The first row is the header: every cell after the first is an x value. In
    each following row the first cell is the label and the remaining cells are
    that label's values.

    Returns a tuple ``(x, labels, label_stats)`` where ``x`` and ``labels``
    hold sanitized strings and ``label_stats`` maps a label index to its list
    of float values. An empty input yields ``([], [], {})``.
    """
    data_rows = list(csv.reader(csvfile, delimiter=","))
    if not data_rows:
        # No header row at all: report "nothing to plot" instead of raising
        # IndexError; callers skip files whose x/labels are empty.
        return [], [], {}
    x = [sanitize(cell) for cell in data_rows[0][1:]]
    labels = []
    label_stats = {}
    for index, row in enumerate(data_rows[1:]):
        labels.append(sanitize(row[0]))
        label_stats[index] = [float(cell) for cell in row[1:]]
    return x, labels, label_stats
+
+
def read_data_for_plot(csvfile, vertical):
    """Parse ``csvfile`` column-wise when ``vertical`` is truthy, else row-wise."""
    reader = read_data_for_plot_vertical if vertical else read_data_for_plot_horizontal
    return reader(csvfile)
+
+
def plot_line_charts(
    csv_result_dir,
    output_result_dir,
    filename_prefix,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    legend,
):
    """Draw one line chart per matching csv file into a multi-page PDF.

    Files in ``csv_result_dir`` whose names start with ``filename_prefix`` and
    end with ``filename_suffix`` are parsed (column-wise if ``vertical``), and
    each label becomes one line. The last x value and the last sample of every
    label are not plotted. Files that yield no x values or no labels are
    skipped. The x-axis label is only set for files whose name contains "_60"
    (minutes) or "_3600" (hours). Pages are written to
    ``<output_result_dir>/<pdf_name>``.
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
    try:
        for file in os.listdir(csv_result_dir):
            if not file.endswith(filename_suffix):
                continue
            if not file.startswith(filename_prefix):
                continue
            print("Processing file {}/{}".format(csv_result_dir, file))
            with open(csv_result_dir + "/" + file, "r") as csvfile:
                x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(labels) == 0:
                continue
            # plot figure
            fig = plt.figure()
            x_values = [int(value) for value in x[:-1]]
            for label_index in label_stats:
                label = labels[label_index]
                # Assign a unique color to this label.
                if label not in bar_color_maps:
                    # Wrap around so that more labels than available colors
                    # reuse colors instead of raising IndexError.
                    bar_color_maps[label] = colors[color_index % len(colors)]
                    color_index += 1
                plt.plot(
                    x_values,
                    label_stats[label_index][:-1],
                    label=label,
                    color=bar_color_maps[label],
                )

            # Translate time unit into x labels.
            if "_60" in file:
                plt.xlabel("{} (Minute)".format(xlabel))
            if "_3600" in file:
                plt.xlabel("{} (Hour)".format(xlabel))
            plt.ylabel(ylabel)
            plt.title("{} {}".format(title, file))
            if legend:
                plt.legend()
            pdf.savefig(fig)
            # Release the figure; pyplot otherwise keeps every figure alive.
            plt.close(fig)
    finally:
        # Always finalize the PDF, even if a file fails to parse or plot.
        pdf.close()
+
+
def plot_stacked_bar_charts(
    csv_result_dir,
    output_result_dir,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    x_prefix,
):
    """Draw one stacked bar chart per matching csv file into a multi-page PDF.

    Files in ``csv_result_dir`` whose names end with ``filename_suffix`` are
    parsed (column-wise if ``vertical``). Each x value becomes one bar whose
    segments are the labels' values stacked in label order; tick labels are
    the x values prefixed with ``x_prefix``. Files that yield no x values or
    no label data are skipped. Pages are written to
    ``<output_result_dir>/<pdf_name>``.
    """
    global color_index, bar_color_maps, colors
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    try:
        for file in os.listdir(csv_result_dir):
            if not file.endswith(filename_suffix):
                continue
            with open(csv_result_dir + "/" + file, "r") as csvfile:
                print("Processing file {}/{}".format(csv_result_dir, file))
                x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(label_stats) == 0:
                continue
            # Plot figure
            fig = plt.figure()
            ind = np.arange(len(x))  # the x locations for the groups
            width = 0.5  # the width of the bars: can also be len(x) sequence
            bars = []
            # Running height of each stack; the next segment starts here.
            bottom_bars = [0] * len(label_stats[0])
            for i in range(len(label_stats)):
                label = labels[i]
                # Assign a unique color to this label.
                if label not in bar_color_maps:
                    # Wrap around so that more labels than available colors
                    # reuse colors instead of raising IndexError.
                    bar_color_maps[label] = colors[color_index % len(colors)]
                    color_index += 1
                p = plt.bar(
                    ind,
                    label_stats[i],
                    width,
                    bottom=bottom_bars,
                    color=bar_color_maps[label],
                )
                bars.append(p[0])
                for j, height in enumerate(label_stats[i]):
                    bottom_bars[j] += height
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.xticks(
                ind, [x_prefix + value for value in x], rotation=20, fontsize=8
            )
            plt.legend(bars, labels)
            plt.title("{} filename:{}".format(title, file))
            pdf.savefig(fig)
            # Release the figure; pyplot otherwise keeps every figure alive.
            plt.close(fig)
    finally:
        # Always finalize the PDF, even if a file fails to parse or plot.
        pdf.close()
+
+
def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
    """Render each matching csv file as an annotated heatmap page of a PDF.

    Files in ``csv_result_dir`` whose names end with ``filename_suffix`` must
    provide the columns ``label``, ``corr`` and ``value``. They are pivoted
    into a table with one row per ``label`` and one column per ``corr``, and
    drawn with seaborn. Pages are written to
    ``<output_result_dir>/<pdf_name>``.
    """
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        "{}/{}".format(output_result_dir, pdf_name)
    )
    try:
        for file in os.listdir(csv_result_dir):
            if not file.endswith(filename_suffix):
                continue
            csv_file_name = "{}/{}".format(csv_result_dir, file)
            print("Processing file {}/{}".format(csv_result_dir, file))
            corr_table = pd.read_csv(csv_file_name)
            # Keyword arguments work on every pandas version; recent pandas
            # no longer accepts these arguments positionally.
            corr_table = corr_table.pivot(index="label", columns="corr", values="value")
            fig = plt.figure()
            sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
            plt.title("{} filename:{}".format(title, file))
            pdf.savefig(fig)
            # Release the figure; pyplot otherwise keeps every figure alive.
            plt.close(fig)
    finally:
        # Always finalize the PDF, even if a file fails to parse or plot.
        pdf.close()
+
+
def plot_timeline(csv_result_dir, output_result_dir):
    """Plot every "*access_timeline" csv file as line charts in access_time.pdf."""
    chart_options = {
        "filename_prefix": "",
        "filename_suffix": "access_timeline",
        "pdf_name": "access_time.pdf",
        "xlabel": "Time",
        "ylabel": "Throughput",
        "title": "Access timeline with group by label",
        "vertical": False,
        "legend": True,
    }
    plot_line_charts(csv_result_dir, output_result_dir, **chart_options)
+
+
def convert_to_0_if_nan(n):
    """Return 0.0 when ``n`` is NaN; otherwise return ``n`` unchanged."""
    return 0.0 if math.isnan(n) else n
+
+
def plot_correlation(csv_result_dir, output_result_dir):
    """Compute Spearman correlations per label and plot them as heatmaps.

    Every ``<label_str>_<label>_correlation_input`` csv file in
    ``csv_result_dir`` contributes six rows (one per feature pair below) to
    ``<csv_result_dir>/<label_str>_correlation_output``, a csv file with the
    header ``label,corr,value``. NaN correlations are written as 0.0. Once
    all inputs are processed, the output files are rendered into
    ``<output_result_dir>/correlation.pdf``.
    """
    # (tag written to the "corr" column, first feature, second feature)
    correlation_pairs = (
        ("LA+A", "num_accesses_since_last_access", "num_accesses_till_next_access"),
        ("PA+A", "num_past_accesses", "num_accesses_till_next_access"),
        ("LT+A", "elapsed_time_since_last_access", "num_accesses_till_next_access"),
        ("LA+T", "num_accesses_since_last_access", "elapsed_time_till_next_access"),
        ("LT+T", "elapsed_time_since_last_access", "elapsed_time_till_next_access"),
        ("PA+T", "num_past_accesses", "elapsed_time_till_next_access"),
    )
    # Processing the correlation input first.
    label_str_file = {}
    try:
        for file in os.listdir(csv_result_dir):
            if not file.endswith("correlation_input"):
                continue
            csv_file_name = "{}/{}".format(csv_result_dir, file)
            print("Processing file {}/{}".format(csv_result_dir, file))
            corr_table = pd.read_csv(csv_file_name)
            # File names look like "<label_str>_<label>_correlation_input".
            label_str = file.split("_")[0]
            label = file[len(label_str) + 1 :]
            label = label[: len(label) - len("_correlation_input")]

            output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
            if output_file not in label_str_file:
                f = open(output_file, "w+")
                label_str_file[output_file] = f
                f.write("label,corr,value\n")
            f = label_str_file[output_file]
            for tag, first_feature, second_feature in correlation_pairs:
                correlation = corr_table[first_feature].corr(
                    corr_table[second_feature], method="spearman"
                )
                f.write(
                    "{},{},{}\n".format(label, tag, convert_to_0_if_nan(correlation))
                )
    finally:
        # Close every output file even when an input fails to parse, so the
        # handles are not leaked and buffered rows are flushed.
        for f in label_str_file.values():
            f.close()

    plot_heatmap(
        csv_result_dir,
        output_result_dir,
        "correlation_output",
        "correlation.pdf",
        "Correlation",
    )
+
+
def plot_reuse_graphs(csv_result_dir, output_result_dir):
    """Plot the reuse interval/lifetime bar charts and the reuse-blocks timeline."""
    # (filename_suffix, pdf_name, xlabel, ylabel, title) for each bar chart.
    bar_charts = (
        (
            "avg_reuse_interval_naccesses",
            "avg_reuse_interval_naccesses.pdf",
            "",
            "Percentage of accesses",
            "Average reuse interval",
        ),
        (
            "avg_reuse_interval",
            "avg_reuse_interval.pdf",
            "",
            "Percentage of blocks",
            "Average reuse interval",
        ),
        (
            "access_reuse_interval",
            "reuse_interval.pdf",
            "Seconds",
            "Percentage of accesses",
            "Reuse interval",
        ),
        (
            "reuse_lifetime",
            "reuse_lifetime.pdf",
            "Seconds",
            "Percentage of blocks",
            "Reuse lifetime",
        ),
    )
    for suffix, pdf_name, xlabel, ylabel, title in bar_charts:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel=xlabel,
            ylabel=ylabel,
            title=title,
            vertical=True,
            x_prefix="< ",
        )
    plot_line_charts(
        csv_result_dir,
        output_result_dir,
        filename_prefix="",
        filename_suffix="reuse_blocks_timeline",
        pdf_name="reuse_blocks_timeline.pdf",
        xlabel="",
        ylabel="Percentage of blocks",
        title="Reuse blocks timeline",
        vertical=False,
        legend=False,
    )
+
+
def plot_percentage_access_summary(csv_result_dir, output_result_dir):
    """Plot the percentage-of-accesses and referenced-key summary bar charts."""
    # (filename_suffix, pdf_name, ylabel) for each chart, in plotting order.
    summaries = (
        (
            "percentage_of_accesses_summary",
            "percentage_access.pdf",
            "Percentage of accesses",
        ),
        ("percent_ref_keys", "percent_ref_keys.pdf", "Percentage of blocks"),
        (
            "percent_data_size_on_ref_keys",
            "percent_data_size_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
        (
            "percent_accesses_on_ref_keys",
            "percent_accesses_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
    )
    for suffix, pdf_name, ylabel in summaries:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel="",
            ylabel=ylabel,
            title="",
            vertical=True,
            x_prefix="",
        )
+
+
def plot_access_count_summary(csv_result_dir, output_result_dir):
    """Plot the access-count summary bar chart and the skewness line chart."""
    bar_chart_options = {
        "filename_suffix": "access_count_summary",
        "pdf_name": "access_count_summary.pdf",
        "xlabel": "Access count",
        "ylabel": "Percentage of blocks",
        "title": "",
        "vertical": True,
        "x_prefix": "< ",
    }
    line_chart_options = {
        "filename_prefix": "",
        "filename_suffix": "skewness",
        "pdf_name": "skew.pdf",
        "xlabel": "",
        "ylabel": "Percentage of accesses",
        "title": "Skewness",
        "vertical": True,
        "legend": False,
    }
    plot_stacked_bar_charts(csv_result_dir, output_result_dir, **bar_chart_options)
    plot_line_charts(csv_result_dir, output_result_dir, **line_chart_options)
+
+
def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
    """Plot the hourly ("3600_*") miss and policy timelines as line charts.

    One PDF is produced per timeline kind: miss ratio, number of misses,
    number of times a policy is selected, and the percentage of times a
    policy is selected.
    """
    # (filename_suffix, pdf_name, ylabel, title). The miss timeline used to be
    # plotted twice in a row into the same miss_timeline.pdf; the redundant
    # second pass has been dropped.
    timelines = (
        (
            "3600_miss_ratio_timeline",
            "miss_ratio_timeline.pdf",
            "Miss Ratio (%)",
            "Miss ratio timeline",
        ),
        ("3600_miss_timeline", "miss_timeline.pdf", "# of misses ", "Miss timeline"),
        (
            "3600_policy_timeline",
            "policy_timeline.pdf",
            "# of times a policy is selected ",
            "Policy timeline",
        ),
        (
            "3600_policy_ratio_timeline",
            "policy_ratio_timeline.pdf",
            "Percentage of times a policy is selected ",
            "Policy timeline",
        ),
    )
    for suffix, pdf_name, ylabel, title in timelines:
        plot_line_charts(
            csv_result_dir,
            output_result_dir,
            filename_prefix="",
            filename_suffix=suffix,
            pdf_name=pdf_name,
            xlabel="Time",
            ylabel=ylabel,
            title=title,
            vertical=False,
            legend=True,
        )
+
+
if __name__ == "__main__":
    # Usage: <script> <csv_result_dir> <output_result_dir>
    # Each sub-directory of csv_result_dir is treated as one experiment; its
    # graphs are written to the sub-directory of the same name under
    # output_result_dir.
    if len(sys.argv) < 3:
        print(
            "Must provide two arguments: \n"
            "1) The directory that saves a list of "
            "directories which contain block cache trace analyzer result files. \n"
            "2) the directory to save plotted graphs. \n"
        )
        # sys.exit is always available, whereas the bare exit() helper is only
        # injected by the site module.
        sys.exit(1)
    csv_result_dir = sys.argv[1]
    output_result_dir = sys.argv[2]
    print(
        "Processing directory {} and save graphs to {}.".format(
            csv_result_dir, output_result_dir
        )
    )
    for csv_relative_dir in os.listdir(csv_result_dir):
        csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
        result_dir = output_result_dir + "/" + csv_relative_dir
        if not os.path.isdir(csv_abs_dir):
            print("{} is not a directory".format(csv_abs_dir))
            continue
        print("Processing experiment dir: {}".format(csv_relative_dir))
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        plot_access_count_summary(csv_abs_dir, result_dir)
        plot_timeline(csv_abs_dir, result_dir)
        # NOTE(review): unlike every other call in this loop, this one uses
        # the top-level directories rather than csv_abs_dir/result_dir, so it
        # repeats the same work for each experiment. Kept as-is; confirm
        # whether the per-experiment directories were intended.
        plot_miss_ratio_timeline(csv_result_dir, output_result_dir)
        plot_correlation(csv_abs_dir, result_dir)
        plot_reuse_graphs(csv_abs_dir, result_dir)
        plot_percentage_access_summary(csv_abs_dir, result_dir)
        plot_miss_stats_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc",
        )
        plot_miss_stats_diff_lru_graphs(
            csv_abs_dir,
            result_dir,
            file_prefix="",
            file_suffix="mrc",
            ylabel="Miss ratio (%)",
            pdf_file_name="mrc_diff_lru",
        )
        # The following stats are only available in pysim.
        for time_unit in ["1", "60", "3600"]:
            # Same call order as before: plain graphs (p95, avg) first, then
            # the diff-against-LRU graphs (p95, avg).
            for plot_function, pdf_suffix in (
                (plot_miss_stats_graphs, ""),
                (plot_miss_stats_diff_lru_graphs, "_diff_lru"),
            ):
                for stat_suffix, stat_name in (("p95mb", "p95"), ("avgmb", "Average")):
                    plot_function(
                        csv_abs_dir,
                        result_dir,
                        file_prefix="ml_{}_".format(time_unit),
                        file_suffix=stat_suffix,
                        ylabel="{} number of byte miss per {} seconds".format(
                            stat_name, time_unit
                        ),
                        pdf_file_name="{}_per{}_seconds{}".format(
                            stat_suffix, time_unit, pdf_suffix
                        ),
                    )
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
new file mode 100644
index 000000000..1dc723629
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -0,0 +1,717 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {  // Stub entry point compiled when GFLAGS is not defined.
+ fprintf(stderr,
+ "Please install gflags to run block_cache_trace_analyzer_test\n");
+ return 1;  // Non-zero: the test binary cannot run without gflags.
+}
+#else
+
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint64_t kBlockSize = 1024;  // Base size; each record uses kBlockSize + key_id.
+const std::string kBlockKeyPrefix = "test-block-";  // Block key = prefix + key id.
+const uint32_t kCFId = 0;
+const uint32_t kLevel = 1;
+const uint64_t kSSTStoringEvenKeys = 100;  // sst_fd_number recorded for even key ids.
+const uint64_t kSSTStoringOddKeys = 101;  // sst_fd_number recorded for odd key ids.
+const std::string kRefKeyPrefix = "test-get-";  // Prefix of each record's referenced key.
+const uint64_t kNumKeysInBlock = 1024;
+const int kMaxArgCount = 100;  // Capacity of the argv array handed to the analyzer tool.
+const size_t kArgBufferSize = 100000;  // Bytes available for all NUL-terminated arguments.
+} // namespace
+
+class BlockCacheTracerTest : public testing::Test {
+ public:
+ BlockCacheTracerTest() {  // Creates the test directory and sets the analyzer flag values.
+ test_path_ = test::PerThreadDBPath("block_cache_tracer_test");
+ env_ = ROCKSDB_NAMESPACE::Env::Default();
+ EXPECT_OK(env_->CreateDir(test_path_));
+ trace_file_path_ = test_path_ + "/block_cache_trace";
+ block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config";  // The strings below become analyzer flags.
+ timeline_labels_ =
+ "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+ reuse_distance_labels_ =
+ "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+ reuse_distance_buckets_ = "1,1K,1M,1G";
+ reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt";
+ reuse_interval_buckets_ = "1,10,100,1000";
+ reuse_lifetime_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt";
+ reuse_lifetime_buckets_ = "1,10,100,1000";
+ analyzing_callers_ = "Get,Iterator";  // Passed as -analyze_callers.
+ access_count_buckets_ = "2,3,4,5,10";
+ analyze_get_spatial_locality_labels_ = "all";
+ analyze_get_spatial_locality_buckets_ = "10,20,30,40,50,60,70,80,90,100";
+ }
+
+ ~BlockCacheTracerTest() override {  // Deletes the trace file and test dir unless KEEP_DB is set.
+ if (getenv("KEEP_DB")) {
+ printf("The trace file is still at %s\n", trace_file_path_.c_str());
+ return;  // Keep everything on disk for inspection.
+ }
+ EXPECT_OK(env_->DeleteFile(trace_file_path_));
+ EXPECT_OK(env_->DeleteDir(test_path_));
+ }
+
+ TableReaderCaller GetCaller(uint32_t key_id) {  // Maps key_id to one of five callers via key_id % 5.
+ uint32_t n = key_id % 5;
+ switch (n) {
+ case 0:
+ return TableReaderCaller::kPrefetch;
+ case 1:
+ return TableReaderCaller::kCompaction;
+ case 2:
+ return TableReaderCaller::kUserGet;
+ case 3:
+ return TableReaderCaller::kUserMultiGet;
+ case 4:
+ return TableReaderCaller::kUserIterator;
+ }
+ // This cannot happen: n is in [0, 4] and every case above returns.
+ assert(false);
+ return TableReaderCaller::kMaxBlockCacheLookupCaller;
+ }
+
+ void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
+ TraceType block_type, uint32_t nblocks) {  // Writes nblocks miss records for key ids from_key_id, from_key_id + 1, ...
+ assert(writer);
+ for (uint32_t i = 0; i < nblocks; i++) {
+ uint32_t key_id = from_key_id + i;
+ uint64_t timestamp = (key_id + 1) * kMicrosInSecond;  // Timestamps grow with key_id and are never 0.
+ BlockCacheTraceRecord record;
+ record.block_type = block_type;
+ record.block_size = kBlockSize + key_id;  // Distinct size per key id.
+ record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+ record.access_timestamp = timestamp;
+ record.cf_id = kCFId;
+ record.cf_name = kDefaultColumnFamilyName;
+ record.caller = GetCaller(key_id);
+ record.level = kLevel;
+ if (key_id % 2 == 0) {
+ record.sst_fd_number = kSSTStoringEvenKeys;
+ } else {
+ record.sst_fd_number = kSSTStoringOddKeys;
+ }
+ record.is_cache_hit = Boolean::kFalse;
+ record.no_insert = Boolean::kFalse;
+ // Provide these fields for all block types.
+ // The writer should only write these fields for data blocks when the
+ // caller is either GET or MGET.
+ record.referenced_key =
+ kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0);  // 8 zero bytes appended; presumably an internal-key footer — TODO confirm.
+ record.referenced_key_exist_in_block = Boolean::kTrue;
+ record.num_keys_in_block = kNumKeysInBlock;
+ ASSERT_OK(writer->WriteBlockAccess(
+ record, record.block_key, record.cf_name, record.referenced_key));
+ }
+ }
+
+ void AssertBlockAccessInfo(
+ uint32_t key_id, TraceType type,
+ const std::map<std::string, BlockAccessInfo>& block_access_info_map) {  // Checks the entry for key_id against what WriteBlockAccess() wrote.
+ auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+ ASSERT_TRUE(block_access_info_map.find(key_id_str) !=
+ block_access_info_map.end());
+ auto& block_access_info = block_access_info_map.find(key_id_str)->second;
+ ASSERT_EQ(1, block_access_info.num_accesses);  // Each block is written exactly once.
+ ASSERT_EQ(kBlockSize + key_id, block_access_info.block_size);
+ ASSERT_GT(block_access_info.first_access_time, 0);
+ ASSERT_GT(block_access_info.last_access_time, 0);
+ ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
+ TableReaderCaller expected_caller = GetCaller(key_id);
+ ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
+ block_access_info.caller_num_access_map.end());
+ ASSERT_EQ(
+ 1,
+ block_access_info.caller_num_access_map.find(expected_caller)->second);
+
+ if ((expected_caller == TableReaderCaller::kUserGet ||
+ expected_caller == TableReaderCaller::kUserMultiGet) &&
+ type == TraceType::kBlockTraceDataBlock) {  // Referenced-key stats are only checked for Get/MultiGet on data blocks.
+ ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
+ ASSERT_EQ(1, block_access_info.key_num_access_map.size());
+ ASSERT_EQ(0, block_access_info.non_exist_key_num_access_map.size());
+ ASSERT_EQ(1, block_access_info.num_referenced_key_exist_in_block);
+ }
+ }
+
+ void RunBlockCacheTraceAnalyzer() {  // Builds an argv from the fixture settings and runs the analyzer tool in-process.
+ std::vector<std::string> params = {
+ "./block_cache_trace_analyzer",
+ "-block_cache_trace_path=" + trace_file_path_,
+ "-block_cache_sim_config_path=" + block_cache_sim_config_path_,
+ "-block_cache_analysis_result_dir=" + test_path_,
+ "-print_block_size_stats",
+ "-print_access_count_stats",
+ "-print_data_block_access_count_stats",
+ "-cache_sim_warmup_seconds=0",
+ "-analyze_bottom_k_access_count_blocks=5",
+ "-analyze_top_k_access_count_blocks=5",
+ "-analyze_blocks_reuse_k_reuse_window=5",
+ "-timeline_labels=" + timeline_labels_,
+ "-reuse_distance_labels=" + reuse_distance_labels_,
+ "-reuse_distance_buckets=" + reuse_distance_buckets_,
+ "-reuse_interval_labels=" + reuse_interval_labels_,
+ "-reuse_interval_buckets=" + reuse_interval_buckets_,
+ "-reuse_lifetime_labels=" + reuse_lifetime_labels_,
+ "-reuse_lifetime_buckets=" + reuse_lifetime_buckets_,
+ "-analyze_callers=" + analyzing_callers_,
+ "-access_count_buckets=" + access_count_buckets_,
+ "-analyze_get_spatial_locality_labels=" +
+ analyze_get_spatial_locality_labels_,
+ "-analyze_get_spatial_locality_buckets=" +
+ analyze_get_spatial_locality_buckets_,
+ "-analyze_correlation_coefficients_labels=all",
+ "-skew_labels=all",
+ "-skew_buckets=10,50,100"};
+ char arg_buffer[kArgBufferSize];  // Backing storage for every argv string.
+ char* argv[kMaxArgCount];
+ int argc = 0;
+ int cursor = 0;  // Offset of the next free byte in arg_buffer.
+ for (const auto& arg : params) {
+ ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+ ASSERT_LE(argc + 1, kMaxArgCount);
+ snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());  // Copies arg and its NUL terminator.
+
+ argv[argc++] = arg_buffer + cursor;
+ cursor += static_cast<int>(arg.size()) + 1;
+ }
+ ASSERT_EQ(0,
+ ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv));
+ }
+
+ Env* env_;
+ EnvOptions env_options_;
+ std::string block_cache_sim_config_path_;
+ std::string trace_file_path_;
+ std::string test_path_;
+ std::string timeline_labels_;
+ std::string reuse_distance_labels_;
+ std::string reuse_distance_buckets_;
+ std::string reuse_interval_labels_;
+ std::string reuse_interval_buckets_;
+ std::string reuse_lifetime_labels_;
+ std::string reuse_lifetime_buckets_;
+ std::string analyzing_callers_;
+ std::string access_count_buckets_;
+ std::string analyze_get_spatial_locality_labels_;
+ std::string analyze_get_spatial_locality_buckets_;
+};
+
+TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {  // End-to-end run of the analyzer tool; each output file is validated, then deleted.
+ {
+ // Generate a trace file of 50 data-block accesses (key ids 0-49), all misses.
+ TraceOptions trace_opt;
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+ &trace_writer));
+ BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+ ASSERT_OK(writer.WriteHeader());
+ WriteBlockAccess(&writer, 0, TraceType::kBlockTraceDataBlock, 50);
+ ASSERT_OK(env_->FileExists(trace_file_path_));
+ }
+ {
+ // Generate a cache sim config (one lru entry with capacities 1K, 1M and 1G).
+ std::string config = "lru,1,0,1K,1M,1G";
+ std::ofstream out(block_cache_sim_config_path_);
+ ASSERT_TRUE(out.is_open());
+ out << config << std::endl;
+ out.close();
+ }
+ RunBlockCacheTraceAnalyzer();
+ {
+ // Validate the cache miss ratios.
+ std::vector<uint64_t> expected_capacities{1024, 1024 * 1024,
+ 1024 * 1024 * 1024};
+ const std::string mrc_path = test_path_ + "/49_50_mrc";
+ std::ifstream infile(mrc_path);
+ uint32_t config_index = 0;
+ std::string line;
+ // Read header.
+ ASSERT_TRUE(getline(infile, line));
+ while (getline(infile, line)) {  // One row per configured capacity.
+ std::stringstream ss(line);
+ std::vector<std::string> result_strs;
+ while (ss.good()) {
+ std::string substr;
+ getline(ss, substr, ',');
+ result_strs.push_back(substr);
+ }
+ ASSERT_EQ(6, result_strs.size());
+ ASSERT_LT(config_index, expected_capacities.size());
+ ASSERT_EQ("lru", result_strs[0]); // cache_name
+ ASSERT_EQ("1", result_strs[1]); // num_shard_bits
+ ASSERT_EQ("0", result_strs[2]); // ghost_cache_capacity
+ ASSERT_EQ(std::to_string(expected_capacities[config_index]),
+ result_strs[3]); // cache_capacity
+ ASSERT_EQ("100.0000", result_strs[4]); // miss_ratio
+ ASSERT_EQ("50", result_strs[5]); // number of accesses.
+ config_index++;
+ }
+ ASSERT_EQ(expected_capacities.size(), config_index);
+ infile.close();
+ ASSERT_OK(env_->DeleteFile(mrc_path));
+
+ const std::vector<std::string> time_units{"1", "60", "3600"};
+ expected_capacities.push_back(port::kMaxUint64);  // Extra entry whose timeline row is labeled "trace".
+ for (auto const& expected_capacity : expected_capacities) {
+ for (auto const& time_unit : time_units) {
+ const std::string miss_ratio_timeline_path =
+ test_path_ + "/" + std::to_string(expected_capacity) + "_" +
+ time_unit + "_miss_ratio_timeline";
+ std::ifstream mrt_file(miss_ratio_timeline_path);
+ // Read the header, then the single data row.
+ ASSERT_TRUE(getline(mrt_file, line));
+ ASSERT_TRUE(getline(mrt_file, line));
+ std::stringstream ss(line);
+ bool read_header = false;  // False until the row's leading label cell is consumed.
+ while (ss.good()) {
+ std::string substr;
+ getline(ss, substr, ',');
+ if (!read_header) {
+ if (expected_capacity == port::kMaxUint64) {
+ ASSERT_EQ("trace", substr);
+ } else {
+ ASSERT_EQ("lru-1-0", substr);
+ }
+ read_header = true;
+ continue;
+ }
+ ASSERT_DOUBLE_EQ(100.0, ParseDouble(substr));
+ }
+ ASSERT_FALSE(getline(mrt_file, line));
+ mrt_file.close();
+ ASSERT_OK(env_->DeleteFile(miss_ratio_timeline_path));
+ }
+ for (auto const& time_unit : time_units) {
+ const std::string miss_timeline_path =
+ test_path_ + "/" + std::to_string(expected_capacity) + "_" +
+ time_unit + "_miss_timeline";
+ std::ifstream mt_file(miss_timeline_path);
+ // Read the header, then the single data row.
+ ASSERT_TRUE(getline(mt_file, line));
+ ASSERT_TRUE(getline(mt_file, line));
+ std::stringstream ss(line);
+ uint32_t num_misses = 0;  // Also doubles as the "label cell seen" marker below.
+ while (ss.good()) {
+ std::string substr;
+ getline(ss, substr, ',');
+ if (num_misses == 0) {
+ if (expected_capacity == port::kMaxUint64) {
+ ASSERT_EQ("trace", substr);
+ } else {
+ ASSERT_EQ("lru-1-0", substr);
+ }
+ num_misses++;
+ continue;
+ }
+ num_misses += ParseInt(substr);
+ }
+ ASSERT_EQ(51, num_misses);  // 50 misses plus the 1 counted for the label cell.
+ ASSERT_FALSE(getline(mt_file, line));
+ mt_file.close();
+ ASSERT_OK(env_->DeleteFile(miss_timeline_path));
+ }
+ }
+ }
+ {
+ // Validate the skewness csv file.
+ const std::string skewness_file_path = test_path_ + "/all_skewness";
+ std::ifstream skew_file(skewness_file_path);
+ // Read header.
+ std::string line;
+ ASSERT_TRUE(getline(skew_file, line));
+ std::stringstream ss(line);
+ double sum_percent = 0;
+ while (getline(skew_file, line)) {
+ std::stringstream ss_naccess(line);
+ std::string substr;
+ bool read_label = false;  // The first cell of each row is a label, not a value.
+ while (ss_naccess.good()) {
+ ASSERT_TRUE(getline(ss_naccess, substr, ','));
+ if (!read_label) {
+ read_label = true;
+ continue;
+ }
+ sum_percent += ParseDouble(substr);
+ }
+ }
+ ASSERT_EQ(100.0, sum_percent);
+ ASSERT_FALSE(getline(skew_file, line));
+ skew_file.close();
+ ASSERT_OK(env_->DeleteFile(skewness_file_path));
+ }
+ {
+ // Validate the timeline csv files.
+ const std::vector<std::string> time_units{"_60", "_3600"};
+ const std::vector<std::string> user_access_only_flags{"user_access_only_",
+ "all_access_"};
+ for (auto const& user_access_only : user_access_only_flags) {
+ for (auto const& unit : time_units) {
+ std::stringstream ss(timeline_labels_);
+ while (ss.good()) {
+ std::string l;
+ ASSERT_TRUE(getline(ss, l, ','));
+ if (l.find("block") == std::string::npos) {  // Labels without "block" are only checked for all_access_.
+ if (user_access_only != "all_access_") {
+ continue;
+ }
+ }
+ const std::string timeline_file = test_path_ + "/" +
+ user_access_only + l + unit +
+ "_access_timeline";
+ std::ifstream infile(timeline_file);
+ std::string line;
+ const uint64_t expected_naccesses = 50;
+ const uint64_t expected_user_accesses = 30;  // Presumably the kUser* callers, 3 of every 5 key ids — TODO confirm.
+ ASSERT_TRUE(getline(infile, line)) << timeline_file;
+ uint32_t naccesses = 0;
+ while (getline(infile, line)) {
+ std::stringstream ss_naccess(line);
+ std::string substr;
+ bool read_label = false;
+ while (ss_naccess.good()) {
+ ASSERT_TRUE(getline(ss_naccess, substr, ','));
+ if (!read_label) {
+ read_label = true;
+ continue;
+ }
+ naccesses += ParseUint32(substr);
+ }
+ }
+ if (user_access_only == "user_access_only_") {
+ ASSERT_EQ(expected_user_accesses, naccesses) << timeline_file;
+ } else {
+ ASSERT_EQ(expected_naccesses, naccesses) << timeline_file;
+ }
+ ASSERT_OK(env_->DeleteFile(timeline_file));
+ }
+ }
+ }
+ }
+ {
+ // Validate the reuse interval, reuse distance and reuse lifetime csv files.
+ std::map<std::string, std::string> test_reuse_csv_files;  // File suffix -> labels that have such a file.
+ test_reuse_csv_files["_access_reuse_interval"] = reuse_interval_labels_;
+ test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_;
+ test_reuse_csv_files["_reuse_lifetime"] = reuse_lifetime_labels_;
+ test_reuse_csv_files["_avg_reuse_interval"] = reuse_interval_labels_;
+ test_reuse_csv_files["_avg_reuse_interval_naccesses"] =
+ reuse_interval_labels_;
+ for (auto const& test : test_reuse_csv_files) {
+ const std::string& file_suffix = test.first;
+ const std::string& labels = test.second;
+ const uint32_t expected_num_rows = 5;
+ std::stringstream ss(labels);
+ while (ss.good()) {
+ std::string l;
+ ASSERT_TRUE(getline(ss, l, ','));
+ const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix;
+ std::ifstream infile(reuse_csv_file);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line));
+ double npercentage = 0;
+ uint32_t nrows = 0;
+ while (getline(infile, line)) {
+ std::stringstream ss_naccess(line);
+ bool label_read = false;
+ nrows++;
+ while (ss_naccess.good()) {
+ std::string substr;
+ ASSERT_TRUE(getline(ss_naccess, substr, ','));
+ if (!label_read) {
+ label_read = true;
+ continue;
+ }
+ npercentage += ParseDouble(substr);
+ }
+ }
+ ASSERT_EQ(expected_num_rows, nrows);
+ if ("_reuse_lifetime" == test.first ||
+ "_avg_reuse_interval" == test.first ||
+ "_avg_reuse_interval_naccesses" == test.first) {
+ ASSERT_EQ(100, npercentage) << reuse_csv_file;
+ } else {
+ ASSERT_LT(npercentage, 0);  // NOTE(review): requires a negative sum for the other suffixes — confirm intended.
+ }
+ ASSERT_OK(env_->DeleteFile(reuse_csv_file));
+ }
+ }
+ }
+
+ {
+ // Validate the percentage of accesses summary.
+ const std::string percent_access_summary_file =
+ test_path_ + "/percentage_of_accesses_summary";
+ std::ifstream infile(percent_access_summary_file);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line));
+ std::set<std::string> callers;
+ std::set<std::string> expected_callers{"Get", "MultiGet", "Iterator",
+ "Prefetch", "Compaction"};
+ while (getline(infile, line)) {
+ std::stringstream caller_percent(line);
+ std::string caller;
+ ASSERT_TRUE(getline(caller_percent, caller, ','));
+ std::string percent;
+ ASSERT_TRUE(getline(caller_percent, percent, ','));
+ ASSERT_FALSE(caller_percent.good());
+ callers.insert(caller);
+ ASSERT_EQ(20, ParseDouble(percent));  // GetCaller() gives each of the 5 callers 10 of the 50 accesses.
+ }
+ ASSERT_EQ(expected_callers.size(), callers.size());
+ for (auto caller : callers) {
+ ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end());
+ }
+ ASSERT_OK(env_->DeleteFile(percent_access_summary_file));
+ }
+ {
+ // Validate the percentage of accesses summary by analyzing callers.
+ std::stringstream analyzing_callers(analyzing_callers_);
+ while (analyzing_callers.good()) {
+ std::string caller;
+ ASSERT_TRUE(getline(analyzing_callers, caller, ','));
+ std::vector<std::string> breakdowns{"level", "bt"};
+ for (auto breakdown : breakdowns) {
+ const std::string file_name = test_path_ + "/" + caller + "_" +
+ breakdown +
+ "_percentage_of_accesses_summary";
+ std::ifstream infile(file_name);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line));
+ double sum = 0;
+ while (getline(infile, line)) {
+ std::stringstream label_percent(line);
+ std::string label;
+ ASSERT_TRUE(getline(label_percent, label, ','));
+ std::string percent;
+ ASSERT_TRUE(getline(label_percent, percent, ','));
+ ASSERT_FALSE(label_percent.good());
+ sum += ParseDouble(percent);
+ }
+ ASSERT_EQ(100, sum);
+ ASSERT_OK(env_->DeleteFile(file_name));
+ }
+ }
+ }
+ const std::vector<std::string> access_types{"user_access_only", "all_access"};
+ const std::vector<std::string> prefix{"bt", "cf"};
+ for (auto const& pre : prefix) {
+ for (auto const& access_type : access_types) {
+ {
+ // Validate the access count summary.
+ const std::string bt_access_count_summary = test_path_ + "/" + pre +
+ "_" + access_type +
+ "_access_count_summary";
+ std::ifstream infile(bt_access_count_summary);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line));
+ double sum_percent = 0;
+ while (getline(infile, line)) {
+ std::stringstream bt_percent(line);
+ std::string bt;
+ ASSERT_TRUE(getline(bt_percent, bt, ','));
+ std::string percent;
+ ASSERT_TRUE(getline(bt_percent, percent, ','));
+ sum_percent += ParseDouble(percent);
+ }
+ ASSERT_EQ(100.0, sum_percent);
+ ASSERT_OK(env_->DeleteFile(bt_access_count_summary));
+ }
+ }
+ }
+ for (auto const& access_type : access_types) {
+ std::vector<std::string> block_types{"Index", "Data", "Filter"};
+ for (auto block_type : block_types) {
+ // Validate reuse block timeline.
+ const std::string reuse_blocks_timeline = test_path_ + "/" + block_type +
+ "_" + access_type +
+ "_5_reuse_blocks_timeline";
+ std::ifstream infile(reuse_blocks_timeline);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line)) << reuse_blocks_timeline;
+ uint32_t index = 0;  // Number of data rows read so far.
+ while (getline(infile, line)) {
+ std::stringstream timeline(line);
+ bool start_time = false;  // The first cell of each row is skipped.
+ double sum = 0;
+ while (timeline.good()) {
+ std::string value;
+ ASSERT_TRUE(getline(timeline, value, ','));
+ if (!start_time) {
+ start_time = true;
+ continue;
+ }
+ sum += ParseDouble(value);
+ }
+ index++;
+ ASSERT_LT(sum, 100.0 * index + 1) << reuse_blocks_timeline;
+ }
+ ASSERT_OK(env_->DeleteFile(reuse_blocks_timeline));
+ }
+ }
+
+ std::stringstream ss(analyze_get_spatial_locality_labels_);  // Validate the Get spatial locality csv files.
+ while (ss.good()) {
+ std::string l;
+ ASSERT_TRUE(getline(ss, l, ','));
+ const std::vector<std::string> spatial_locality_files{
+ "_percent_ref_keys", "_percent_accesses_on_ref_keys",
+ "_percent_data_size_on_ref_keys"};
+ for (auto const& spatial_locality_file : spatial_locality_files) {
+ const std::string filename = test_path_ + "/" + l + spatial_locality_file;
+ std::ifstream infile(filename);
+ std::string line;
+ ASSERT_TRUE(getline(infile, line));
+ double sum_percent = 0;
+ uint32_t nrows = 0;
+ while (getline(infile, line)) {
+ std::stringstream bt_percent(line);
+ std::string bt;
+ ASSERT_TRUE(getline(bt_percent, bt, ','));
+ std::string percent;
+ ASSERT_TRUE(getline(bt_percent, percent, ','));
+ sum_percent += ParseDouble(percent);
+ nrows++;
+ }
+ ASSERT_EQ(11, nrows);  // 10 buckets are configured; presumably one extra row is emitted — TODO confirm.
+ ASSERT_EQ(100.0, sum_percent);
+ ASSERT_OK(env_->DeleteFile(filename));
+ }
+ }
+ ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_));
+}
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {  // Writes five block types and checks the analyzer's per-SST, per-type aggregates.
+ {
+ // Generate a trace file containing a mix of blocks.
+ // It contains two SST files with 25 blocks of odd numbered block_key in
+ // kSSTStoringOddKeys and 25 blocks of even numbered block_key in
+ // kSSTStoringEvenKeys.
+ TraceOptions trace_opt;
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+ &trace_writer));
+ BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+ ASSERT_OK(writer.WriteHeader());
+ // Write blocks of different types: 10 consecutive key ids per type.
+ WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock,
+ 10);
+ WriteBlockAccess(&writer, 10, TraceType::kBlockTraceDataBlock, 10);
+ WriteBlockAccess(&writer, 20, TraceType::kBlockTraceFilterBlock, 10);
+ WriteBlockAccess(&writer, 30, TraceType::kBlockTraceIndexBlock, 10);
+ WriteBlockAccess(&writer, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+ ASSERT_OK(env_->FileExists(trace_file_path_));
+ }
+
+ {
+ // Verify trace file is generated correctly.
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+ &trace_reader));
+ BlockCacheTraceReader reader(std::move(trace_reader));
+ BlockCacheTraceHeader header;
+ ASSERT_OK(reader.ReadHeader(&header));
+ ASSERT_EQ(kMajorVersion, header.rocksdb_major_version);
+ ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version);
+ // Read blocks.
+ BlockCacheTraceAnalyzer analyzer(
+ trace_file_path_,
+ /*output_miss_ratio_curve_path=*/"",
+ /*human_readable_trace_file_path=*/"",
+ /*compute_reuse_distance=*/true,
+ /*mrc_only=*/false,
+ /*is_block_cache_human_readable_trace=*/false,
+ /*simulator=*/nullptr);
+ // The analyzer ends when it detects an incomplete access record.
+ ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
+ const uint64_t expected_num_cfs = 1;
+ std::vector<uint64_t> expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys};
+ const std::vector<TraceType> expected_types{
+ TraceType::kBlockTraceUncompressionDictBlock,
+ TraceType::kBlockTraceDataBlock, TraceType::kBlockTraceFilterBlock,
+ TraceType::kBlockTraceIndexBlock,
+ TraceType::kBlockTraceRangeDeletionBlock};  // Same order as the writes above.
+ const uint64_t expected_num_keys_per_type = 5;  // 10 blocks per type, split by parity across the two SSTs.
+
+ auto& stats = analyzer.TEST_cf_aggregates_map();
+ ASSERT_EQ(expected_num_cfs, stats.size());
+ ASSERT_TRUE(stats.find(kDefaultColumnFamilyName) != stats.end());
+ auto& cf_stats = stats.find(kDefaultColumnFamilyName)->second;
+ ASSERT_EQ(expected_fds.size(), cf_stats.fd_aggregates_map.size());
+ for (auto fd_id : expected_fds) {
+ ASSERT_TRUE(cf_stats.fd_aggregates_map.find(fd_id) !=
+ cf_stats.fd_aggregates_map.end());
+ ASSERT_EQ(kLevel, cf_stats.fd_aggregates_map.find(fd_id)->second.level);
+ auto& block_type_aggregates_map = cf_stats.fd_aggregates_map.find(fd_id)
+ ->second.block_type_aggregates_map;
+ ASSERT_EQ(expected_types.size(), block_type_aggregates_map.size());
+ uint32_t key_id = 0;  // Runs 0..49 across the types, mirroring the write order.
+ for (auto type : expected_types) {
+ ASSERT_TRUE(block_type_aggregates_map.find(type) !=
+ block_type_aggregates_map.end());
+ auto& block_access_info_map =
+ block_type_aggregates_map.find(type)->second.block_access_info_map;
+ // Each block type has 5 blocks in each SST file.
+ ASSERT_EQ(expected_num_keys_per_type, block_access_info_map.size());
+ for (uint32_t i = 0; i < 10; i++) {
+ // Verify that odd numbered blocks are stored in kSSTStoringOddKeys
+ // and even numbered blocks are stored in kSSTStoringEvenKeys.
+ auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+ if (fd_id == kSSTStoringOddKeys) {
+ if (key_id % 2 == 1) {
+ AssertBlockAccessInfo(key_id, type, block_access_info_map);
+ } else {
+ ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+ block_access_info_map.end());
+ }
+ } else {
+ if (key_id % 2 == 1) {
+ ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+ block_access_info_map.end());
+ } else {
+ AssertBlockAccessInfo(key_id, type, block_access_info_map);
+ }
+ }
+ key_id++;
+ }
+ }
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {  // gtest entry point for the tests in this file.
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#endif // GFLAGS
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {  // Stub entry point compiled when ROCKSDB_LITE is defined.
+ fprintf(stderr,
+ "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n");
+ return 0;  // Exit status 0, so the unsupported test is not reported as a failure.
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc
new file mode 100644
index 000000000..44fec5598
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {  // Stub entry point compiled when GFLAGS is not defined.
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else // GFLAGS
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+int main(int argc, char** argv) {  // Forwards the command line to the analyzer tool and returns its result.
+ return ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv);
+}
+#endif // GFLAGS
+#else // ROCKSDB_LITE
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {  // Stub entry point compiled when ROCKSDB_LITE is defined.
+ fprintf(stderr, "Not supported in lite mode.\n");
+ return 1;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/check_all_python.py b/src/rocksdb/tools/check_all_python.py
new file mode 100644
index 000000000..17fe95eab
--- /dev/null
+++ b/src/rocksdb/tools/check_all_python.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python2
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import glob
+
+# Checks that all python files in the repository are at least free of syntax
+# errors. This provides a minimal pre-/post-commit check for python file
+# modifications.
+
+filenames = []
+# Avoid scanning all of ./ because there might be other external repos
+# linked in.
+for base in ["buckifier", "build_tools", "coverage", "tools"]:
+ # Clean this up when we finally upgrade to Python 3
+ for suff in ["*", "*/*", "*/*/*"]:
+ filenames += glob.glob(base + "/" + suff + ".py")
+
+for filename in filenames:
+ source = open(filename, 'r').read() + '\n'
+ # Parses and syntax checks the file, throwing on error. (No pyc written.)
+ _ = compile(source, filename, 'exec')
+
+print("No syntax errors in {0} .py files".format(len(filenames)))
diff --git a/src/rocksdb/tools/check_format_compatible.sh b/src/rocksdb/tools/check_format_compatible.sh
new file mode 100755
index 000000000..98c2bb5c2
--- /dev/null
+++ b/src/rocksdb/tools/check_format_compatible.sh
@@ -0,0 +1,191 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to load some pre generated data file to a DB using ldb tool
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> [checkout]
+# `checkout` can be a tag, commit or branch name. Will build using it and check DBs generated by all previous branches (or tags for very old versions without branch) can be opened by it.
+# Return value 0 means all regression tests pass. 1 if not pass.
+
+scriptpath=`dirname $BASH_SOURCE`  # Directory holding this script and its helper scripts.
+test_dir=${TEST_TMPDIR:-"/tmp"}"/format_compatible_check"
+script_copy_dir=$test_dir"/script_copy"
+input_data_path=$test_dir"/test_data_input/"
+
+mkdir $test_dir || true  # Tolerate failure, e.g. when the directory already exists.
+mkdir $input_data_path || true
+rm -rf $script_copy_dir
+cp -rf $scriptpath $script_copy_dir  # Options go before operands so non-GNU cp accepts them too.
+
+# Generate random files.
+for i in {1..6}
+do
+ input_data[$i]=$input_data_path/data$i
+ echo == Generating random input file ${input_data[$i]}
+ python - <<EOF
+import random
+random.seed($i)
+symbols=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+with open('${input_data[$i]}', 'w') as f:
+ for i in range(1,1024):
+ k = ""
+ for j in range(1, random.randint(1,32)):
+ k=k + symbols[random.randint(0, len(symbols) - 1)]
+ vb = ""
+ for j in range(1, random.randint(0,128)):
+ vb = vb + symbols[random.randint(0, len(symbols) - 1)]
+ v = ""
+ for j in range(1, random.randint(1, 5)):
+ v = v + vb
+ print >> f, k + " ==> " + v
+EOF
+done
+
+# Generate file(s) with sorted keys.
+sorted_input_data=$input_data_path/sorted_data
+echo == Generating file with sorted keys ${sorted_input_data}
+python - <<EOF
+with open('${sorted_input_data}', 'w') as f:
+ for i in range(0,10):
+ k = str(i)
+ v = "value" + k
+ print >> f, k + " ==> " + v
+EOF
+
+declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb")  # Only checked in one direction: the new build opens their DBs.
+declare -a forward_compatible_checkout_objs=("4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb")  # Also rebuilt to open the DB generated by the new build.
+declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb")  # Same, in the final "with its options" loop.
+declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]} ${forward_compatible_with_options_checkout_objs[@]})  # Every release that generates a DB for the new build to open.
+declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb")  # Releases used in the external SST ingestion check.
+
+generate_db()
+{  # Usage: generate_db <input_data_path> <db_dir>; exits the script with 1 on failure.
+  set +e  # Check the exit status by hand so the message below gets printed.
+  $script_copy_dir/generate_random_db.sh $1 $2
+  if [ $? -ne 0 ]; then
+    echo ==== Error loading data from $1 to $2 ====
+    exit 1
+  fi
+  set -e  # NOTE: errexit is left enabled regardless of the caller's earlier setting.
+}
+
+compare_db()
+{  # Runs verify_random_db.sh with up to five arguments; exits the script with 1 on failure.
+ set +e  # Check the exit status by hand so the message below gets printed.
+ $script_copy_dir/verify_random_db.sh $1 $2 $3 $4 $5
+ if [ $? -ne 0 ]; then
+ echo ==== Read different content from $1 and $2 or error happened. ====
+ exit 1
+ fi
+ set -e  # NOTE: errexit is left enabled regardless of the caller's earlier setting.
+}
+
+write_external_sst()
+{  # Runs write_external_sst.sh with three arguments; exits the script with 1 on failure.
+ set +e  # Check the exit status by hand so the message below gets printed.
+ $script_copy_dir/write_external_sst.sh $1 $2 $3
+ if [ $? -ne 0 ]; then
+ echo ==== Error writing external SST file using data from $1 to $3 ====
+ exit 1
+ fi
+ set -e  # NOTE: errexit is left enabled regardless of the caller's earlier setting.
+}
+
+ingest_external_sst()
+{  # Runs ingest_external_sst.sh with two arguments; exits the script with 1 on failure.
+ set +e  # Check the exit status by hand so the message below gets printed.
+ $script_copy_dir/ingest_external_sst.sh $1 $2
+ if [ $? -ne 0 ]; then
+ echo ==== Error ingesting external SST in $2 to DB at $1 ====
+ exit 1
+ fi
+ set -e  # NOTE: errexit is left enabled regardless of the caller's earlier setting.
+}
+
+# Sandcastle sets us up with a remote that is just another directory on the same
+# machine and doesn't have our branches. Need to fetch them so checkout works.
+# Remote add may fail if added previously (we don't clean up), so it runs before set -e.
+git remote add github_origin "https://github.com/facebook/rocksdb.git"
+set -e  # From here on, any failing command aborts the script.
+https_proxy="fwdproxy:8080" git fetch github_origin
+
+# Compatibility test for external SST file ingestion
+for checkout_obj in "${extern_sst_ingestion_compatible_checkout_objs[@]}"
+do
+ echo == Generating DB with extern SST file in "$checkout_obj" ...
+ https_proxy="fwdproxy:8080" git checkout github_origin/$checkout_obj -b $checkout_obj
+ make clean
+ make ldb -j32
+ write_external_sst $input_data_path $test_dir/$checkout_obj $test_dir/$checkout_obj
+ ingest_external_sst $test_dir/$checkout_obj $test_dir/$checkout_obj
+done
+
+checkout_flag=${1:-"master"}  # Checkout under test: first script argument, or "master".
+
+echo == Building $checkout_flag debug
+https_proxy="fwdproxy:8080" git checkout github_origin/$checkout_flag -b tmp-$checkout_flag
+make clean
+make ldb -j32
+compare_base_db_dir=$test_dir"/base_db_dir"
+write_external_sst $input_data_path $compare_base_db_dir $compare_base_db_dir
+ingest_external_sst $compare_base_db_dir $compare_base_db_dir
+
+for checkout_obj in "${extern_sst_ingestion_compatible_checkout_objs[@]}"
+do
+ echo == Build "$checkout_obj" and try to open DB generated using $checkout_flag
+ git checkout $checkout_obj
+ make clean
+ make ldb -j32
+ compare_db $test_dir/$checkout_obj $compare_base_db_dir db_dump.txt 1 1
+ git checkout tmp-$checkout_flag
+ # Clean up the local branch; the loop further below creates it again with -b.
+ git branch -D $checkout_obj
+done
+
+echo == Finish compatibility test for SST ingestion.
+
+for checkout_obj in "${checkout_objs[@]}"
+do
+ echo == Generating DB from "$checkout_obj" ...
+ https_proxy="fwdproxy:8080" git checkout github_origin/$checkout_obj -b $checkout_obj
+ make clean
+ make ldb -j32
+ generate_db $input_data_path $test_dir/$checkout_obj
+done
+
+checkout_flag=${1:-"master"}  # Same value as assigned earlier in this script.
+
+echo == Building $checkout_flag debug
+git checkout tmp-$checkout_flag
+make clean
+make ldb -j32
+compare_base_db_dir=$test_dir"/base_db_dir"
+echo == Generate compare base DB to $compare_base_db_dir
+generate_db $input_data_path $compare_base_db_dir
+
+for checkout_obj in "${checkout_objs[@]}"
+do
+ echo == Opening DB from "$checkout_obj" using debug build of $checkout_flag ...
+ compare_db $test_dir/$checkout_obj $compare_base_db_dir db_dump.txt 1 0
+done
+
+for checkout_obj in "${forward_compatible_checkout_objs[@]}"
+do
+ echo == Build "$checkout_obj" and try to open DB generated using $checkout_flag...
+ git checkout $checkout_obj
+ make clean
+ make ldb -j32
+ compare_db $test_dir/$checkout_obj $compare_base_db_dir forward_${checkout_obj}_dump.txt 0
+done
+
+for checkout_obj in "${forward_compatible_with_options_checkout_objs[@]}"
+do
+ echo == Build "$checkout_obj" and try to open DB generated using $checkout_flag with its options...
+ git checkout $checkout_obj
+ make clean
+ make ldb -j32
+ compare_db $test_dir/$checkout_obj $compare_base_db_dir forward_${checkout_obj}_dump.txt 1 1
+done
+
+echo ==== Compatibility Test PASSED ====
diff --git a/src/rocksdb/tools/db_bench.cc b/src/rocksdb/tools/db_bench.cc
new file mode 100644
index 000000000..d4fb50c14
--- /dev/null
+++ b/src/rocksdb/tools/db_bench.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+// gflags is available: forward the command line to the benchmark tool and
+// use its return value as the process exit status.
+#include <rocksdb/db_bench_tool.h>
+int main(int argc, char** argv) {
+  const int exit_status = ROCKSDB_NAMESPACE::db_bench_tool(argc, argv);
+  return exit_status;
+}
+#else
+// Built without gflags: print a hint to stderr and exit with status 1.
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#endif  // GFLAGS
diff --git a/src/rocksdb/tools/db_bench_tool.cc b/src/rocksdb/tools/db_bench_tool.cc
new file mode 100644
index 000000000..5c2ca01e6
--- /dev/null
+++ b/src/rocksdb/tools/db_bench_tool.cc
@@ -0,0 +1,7048 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#ifdef NUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <atomic>
+#include <cinttypes>
+#include <condition_variable>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+
+#include "db/db_impl/db_impl.h"
+#include "db/malloc_stats.h"
+#include "db/version_set.h"
+#include "hdfs/env_hdfs.h"
+#include "monitoring/histogram.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/sim_cache.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+#include "utilities/blob_db/blob_db.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+#ifdef OS_WIN
+#include <io.h> // open/close
+#endif
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+// --benchmarks: comma-separated list of operations to run, in order. The
+// first string is the default list, the second is the --help text.
+// NOTE(review): "readseq" and "readreverse" each appear twice in the default
+// list; left untouched because the list determines what runs by default.
+DEFINE_string(
+ benchmarks,
+ "fillseq,"
+ "fillseqdeterministic,"
+ "fillsync,"
+ "fillrandom,"
+ "filluniquerandomdeterministic,"
+ "overwrite,"
+ "readrandom,"
+ "newiterator,"
+ "newiteratorwhilewriting,"
+ "seekrandom,"
+ "seekrandomwhilewriting,"
+ "seekrandomwhilemerging,"
+ "readseq,"
+ "readreverse,"
+ "compact,"
+ "compactall,"
+ "multireadrandom,"
+ "mixgraph,"
+ "readseq,"
+ "readtorowcache,"
+ "readtocache,"
+ "readreverse,"
+ "readwhilewriting,"
+ "readwhilemerging,"
+ "readwhilescanning,"
+ "readrandomwriterandom,"
+ "updaterandom,"
+ "xorupdaterandom,"
+ "randomwithverify,"
+ "fill100K,"
+ "crc32c,"
+ "xxhash,"
+ "compress,"
+ "uncompress,"
+ "acquireload,"
+ "fillseekseq,"
+ "randomtransaction,"
+ "randomreplacekeys,"
+ "timeseries,"
+ "getmergeoperands",
+
+ "Comma-separated list of operations to run in the specified"
+ " order. Available benchmarks:\n"
+ "\tfillseq -- write N values in sequential key"
+ " order in async mode\n"
+ "\tfillseqdeterministic -- write N values in the specified"
+ " key order and keep the shape of the LSM tree\n"
+ "\tfillrandom -- write N values in random key order in async"
+ " mode\n"
+ "\tfilluniquerandomdeterministic -- write N values in a random"
+ " key order and keep the shape of the LSM tree\n"
+ "\toverwrite -- overwrite N values in random key order in"
+ " async mode\n"
+ "\tfillsync -- write N/1000 values in random key order in "
+ "sync mode\n"
+ "\tfill100K -- write N/1000 100K values in random order in"
+ " async mode\n"
+ "\tdeleteseq -- delete N keys in sequential order\n"
+ "\tdeleterandom -- delete N keys in random order\n"
+ "\treadseq -- read N times sequentially\n"
+ "\treadtocache -- 1 thread reading database sequentially\n"
+ "\treadreverse -- read N times in reverse order\n"
+ "\treadrandom -- read N times in random order\n"
+ "\treadmissing -- read N missing keys in random order\n"
+ "\treadwhilewriting -- 1 writer, N threads doing random "
+ "reads\n"
+ "\treadwhilemerging -- 1 merger, N threads doing random "
+ "reads\n"
+ "\treadwhilescanning -- 1 thread doing full table scan, "
+ "N threads doing random reads\n"
+ "\treadrandomwriterandom -- N threads doing random-read, "
+ "random-write\n"
+ "\tupdaterandom -- N threads doing read-modify-write for random "
+ "keys\n"
+ "\txorupdaterandom -- N threads doing read-XOR-write for "
+ "random keys\n"
+ "\tappendrandom -- N threads doing read-modify-write with "
+ "growing values\n"
+ "\tmergerandom -- same as updaterandom/appendrandom using merge"
+ " operator. "
+ "Must be used with merge_operator\n"
+ "\treadrandommergerandom -- perform N random read-or-merge "
+ "operations. Must be used with merge_operator\n"
+ "\tnewiterator -- repeated iterator creation\n"
+ "\tseekrandom -- N random seeks, call Next seek_nexts times "
+ "per seek\n"
+ "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
+ "overwrite\n"
+ "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
+ "merge\n"
+ "\tcrc32c -- repeated crc32c of 4K of data\n"
+ "\txxhash -- repeated xxHash of 4K of data\n"
+ "\tacquireload -- load N*1000 times\n"
+ "\tfillseekseq -- write N values in sequential key, then read "
+ "them by seeking to each key\n"
+ "\trandomtransaction -- execute N random transactions and "
+ "verify correctness\n"
+ "\trandomreplacekeys -- randomly replaces N keys by deleting "
+ "the old version and putting the new version\n\n"
+ "\ttimeseries -- 1 writer generates time series data "
+ "and multiple readers doing random reads on id\n\n"
+ "Meta operations:\n"
+ "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
+ "\tcompactall -- Compact the entire DB\n"
+ "\tstats -- Print DB stats\n"
+ "\tresetstats -- Reset DB stats\n"
+ "\tlevelstats -- Print the number of files and bytes per level\n"
+ "\tsstables -- Print sstable info\n"
+ "\theapprofile -- Dump a heap profile (if supported by this port)\n"
+ "\treplay -- replay the trace file specified with trace_file\n"
+ "\tgetmergeoperands -- Insert lots of merge records which are a list of "
+ "sorted ints for a key and then compare performance of lookup for another "
+ "key "
+ "by doing a Get followed by binary searching in the large sorted list vs "
+ "doing a GetMergeOperands and binary searching in the operands which are "
+ "sorted sub-lists. The MergeOperator used is sortlist.h\n");
+
+DEFINE_int64(num, 1000000, "Number of key/values to place in database");
+
+DEFINE_int64(numdistinct, 1000,
+ "Number of distinct keys to use. Used in RandomWithVerify to "
+ "read/write on fewer keys so that gets are more likely to find the"
+ " key and puts are more likely to update the same key");
+
+DEFINE_int64(merge_keys, -1,
+ "Number of distinct keys to use for MergeRandom and "
+ "ReadRandomMergeRandom. "
+ "If negative, there will be FLAGS_num keys.");
+DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
+
+DEFINE_int32(
+ num_hot_column_families, 0,
+ "Number of Hot Column Families. If more than 0, only write to this "
+ "number of column families. After finishing all the writes to them, "
+ "create new set of column families and insert to them. Only used "
+ "when num_column_families > 1.");
+
+DEFINE_string(column_family_distribution, "",
+ "Comma-separated list of percentages, where the ith element "
+ "indicates the probability of an op using the ith column family. "
+ "The number of elements must be `num_hot_column_families` if "
+ "specified; otherwise, it must be `num_column_families`. The "
+ "sum of elements must be 100. E.g., if `num_column_families=4`, "
+ "and `num_hot_column_families=0`, a valid list could be "
+ "\"10,20,30,40\".");
+
+DEFINE_int64(reads, -1, "Number of read operations to do. "
+ "If negative, do FLAGS_num reads.");
+
+DEFINE_int64(deletes, -1, "Number of delete operations to do. "
+ "If negative, do FLAGS_num deletions.");
+
+DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
+
+DEFINE_int64(seed, 0, "Seed base for random number generators. "
+ "When 0 it is deterministic.");
+
+DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
+
+DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
+ " When 0 then num & reads determine the test duration");
+
+DEFINE_string(value_size_distribution_type, "fixed",
+ "Value size distribution type: fixed, uniform, normal");
+
+DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
+static unsigned int value_size = 100;
+
+DEFINE_int32(value_size_min, 100, "Min size of random value");
+
+DEFINE_int32(value_size_max, 102400, "Max size of random value");
+
+DEFINE_int32(seek_nexts, 0,
+ "How many times to call Next() after Seek() in "
+ "fillseekseq, seekrandom, seekrandomwhilewriting and "
+ "seekrandomwhilemerging");
+
+DEFINE_bool(reverse_iterator, false,
+ "When true use Prev rather than Next for iterators that do "
+ "Seek and then Next");
+
+DEFINE_int64(max_scan_distance, 0,
+ "Used to define iterate_upper_bound (or iterate_lower_bound "
+ "if FLAGS_reverse_iterator is set to true) when value is nonzero");
+
+DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
+
+DEFINE_int64(batch_size, 1, "Batch size");
+
+// Flag validator that accepts every value; both arguments are ignored.
+// NOTE(review): presumably intended for --key_size, but no registration is
+// visible in this part of the file.
+static bool ValidateKeySize(const char*, int32_t) { return true; }
+
+// Flag validator for 64-bit unsigned flags that must fit in 32 bits
+// (registered for --subcompactions further down in this file).
+//
+// flagname: name of the flag being validated, used only in the message.
+// value:    the value supplied for the flag.
+// Returns true when `value` <= UINT32_MAX; otherwise prints a diagnostic to
+// stderr and returns false.
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    // Print with PRIu64: this branch only runs for values wider than 32
+    // bits, which a cast to `unsigned long` would truncate on platforms
+    // where that type is 32 bits wide.
+    fprintf(stderr, "Invalid value for --%s: %" PRIu64 ", overflow\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_int32(key_size, 16, "size of each key");
+
+DEFINE_int32(num_multi_db, 0,
+ "Number of DBs used in the benchmark. 0 means single DB.");
+
+DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
+ " to this fraction of their original size after compression");
+
+DEFINE_double(read_random_exp_range, 0.0,
+ "Read random's key will be generated using distribution of "
+ "num * exp(-r) where r is uniform number from 0 to this value. "
+ "The larger the number is, the more skewed the reads are. "
+ "Only used in readrandom and multireadrandom benchmarks.");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(enable_numa, false,
+ "Make operations aware of NUMA architecture and bind memory "
+ "and cpus corresponding to nodes together. In NUMA, memory "
+ "in same node as CPUs are closer when compared to memory in "
+ "other nodes. Reads can be faster when the process is bound to "
+ "CPU and memory of same node. Use \"$numactl --hardware\" command "
+ "to see NUMA memory architecture.");
+
+DEFINE_int64(db_write_buffer_size,
+ ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
+ "Number of bytes to buffer in all memtables before compacting");
+
+DEFINE_bool(cost_write_buffer_to_cache, false,
+ "The usage of memtable is costed to the block cache");
+
+DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
+ "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+ ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
+ "The number of in-memory memtables. Each memtable is of size"
+ " write_buffer_size bytes.");
+
+// Adjacent string literals are concatenated verbatim, so every fragment
+// boundary below carries exactly one separating space.
+DEFINE_int32(min_write_buffer_number_to_merge,
+ ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
+ "The minimum number of write buffers that will be merged together"
+ " before writing to storage. This is cheap because it is an"
+ " in-memory merge. If this feature is not enabled, then all these"
+ " write buffers are flushed to L0 as separate files and this "
+ "increases read amplification because a get request has to check"
+ " in all of these files. Also, an in-memory merge may result in"
+ " writing less data to storage if there are duplicate records"
+ " in each of these individual write buffers.");
+
+DEFINE_int32(max_write_buffer_number_to_maintain,
+ ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
+ "The total maximum number of write buffers to maintain in memory "
+ "including copies of buffers that have already been flushed. "
+ "Unlike max_write_buffer_number, this parameter does not affect "
+ "flushing. This controls the minimum amount of write history "
+ "that will be available in memory for conflict checking when "
+ "Transactions are used. If this value is too low, some "
+ "transactions may fail at commit time due to not being able to "
+ "determine whether there were any write conflicts. Setting this "
+ "value to 0 will cause write buffers to be freed immediately "
+ "after they are flushed. If this value is set to -1, "
+ "'max_write_buffer_number' will be used.");
+
+DEFINE_int64(max_write_buffer_size_to_maintain,
+ ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
+ "The total maximum size of write buffers to maintain in memory "
+ "including copies of buffers that have already been flushed. "
+ "Unlike max_write_buffer_number, this parameter does not affect "
+ "flushing. This controls the minimum amount of write history "
+ "that will be available in memory for conflict checking when "
+ "Transactions are used. If this value is too low, some "
+ "transactions may fail at commit time due to not being able to "
+ "determine whether there were any write conflicts. Setting this "
+ "value to 0 will cause write buffers to be freed immediately "
+ "after they are flushed. If this value is set to -1, "
+ "'max_write_buffer_number' will be used.");
+
+DEFINE_int32(max_background_jobs,
+ ROCKSDB_NAMESPACE::Options().max_background_jobs,
+ "The maximum number of concurrent background jobs that can occur "
+ "in parallel.");
+
+DEFINE_int32(num_bottom_pri_threads, 0,
+ "The number of threads in the bottom-priority thread pool (used "
+ "by universal compaction only).");
+
+// Thread-pool sizing flags. The help text previously duplicated the
+// description of --max_background_compactions; it now follows the wording of
+// the sibling --num_bottom_pri_threads flag.
+DEFINE_int32(num_high_pri_threads, 0,
+ "The number of threads in the high-priority thread pool.");
+
+DEFINE_int32(num_low_pri_threads, 0,
+ "The number of threads in the low-priority thread pool.");
+
+DEFINE_int32(max_background_compactions,
+ ROCKSDB_NAMESPACE::Options().max_background_compactions,
+ "The maximum number of concurrent background compactions"
+ " that can occur in parallel.");
+
+DEFINE_int32(base_background_compactions, -1, "DEPRECATED");
+
+DEFINE_uint64(subcompactions, 1,
+ "Maximum number of subcompactions to divide L0-L1 compactions "
+ "into.");
+static const bool FLAGS_subcompactions_dummy
+ __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions,
+ &ValidateUint32Range);
+
+DEFINE_int32(max_background_flushes,
+ ROCKSDB_NAMESPACE::Options().max_background_flushes,
+ "The maximum number of concurrent background flushes"
+ " that can occur in parallel.");
+
+static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
+DEFINE_int32(compaction_style,
+ (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
+ "style of compaction: level-based, universal and fifo");
+
+static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
+DEFINE_int32(compaction_pri,
+ (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
+ "priority of files to compaction: by size or by data age");
+
+DEFINE_int32(universal_size_ratio, 0,
+ "Percentage flexibility while comparing file size"
+ " (for universal compaction only).");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
+ " single compaction run (for universal compaction only).");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+ " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+ "The max size amplification for universal style compaction");
+
+DEFINE_int32(universal_compression_size_percent, -1,
+ "The percentage of the database to compress for universal "
+ "compaction. -1 means compress everything.");
+
+DEFINE_bool(universal_allow_trivial_move, false,
+ "Allow trivial move in universal compaction.");
+
+DEFINE_int64(cache_size, 8 << 20, // 8MB
+ "Number of bytes to use as a cache of uncompressed data");
+
+DEFINE_int32(cache_numshardbits, 6,
+ "Number of shards for the block cache"
+ " is 2 ** cache_numshardbits. Negative means use default settings."
+ " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_double(cache_high_pri_pool_ratio, 0.0,
+ "Ratio of block cache reserve for high pri blocks. "
+ "If > 0.0, we also enable "
+ "cache_index_and_filter_blocks_with_high_priority.");
+
+DEFINE_bool(use_clock_cache, false,
+ "Replace default LRU block cache with clock cache.");
+
+// Size of the simulated cache; the default of -1 leaves it disabled.
+DEFINE_int64(simcache_size, -1,
+ "Number of bytes to use as a simcache of "
+ "uncompressed data. Negative value disables simcache.");
+
+DEFINE_bool(cache_index_and_filter_blocks, false,
+ "Cache index/filter blocks in block cache.");
+
+DEFINE_bool(partition_index_and_filters, false,
+ "Partition index and filter blocks.");
+
+DEFINE_bool(partition_index, false, "Partition index blocks");
+
+DEFINE_int64(metadata_block_size,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
+ "Max partition size when partitioning index/filters");
+
+// The default reduces the overhead of reading time with flash. With HDD, which
+// offers much less throughput, however, this number better to be set to 1.
+DEFINE_int32(ops_between_duration_checks, 1000,
+ "Check duration limit every x ops");
+
+DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
+ "Pin index/filter blocks of L0 files in block cache.");
+
+DEFINE_bool(
+ pin_top_level_index_and_filter, false,
+ "Pin top-level index of partitioned index/filter blocks in block cache.");
+
+DEFINE_int32(block_size,
+ static_cast<int32_t>(
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
+ "Number of bytes in a block.");
+
+DEFINE_int32(format_version,
+ static_cast<int32_t>(
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
+ "Format version of SST files.");
+
+DEFINE_int32(block_restart_interval,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
+ "Number of keys between restart points "
+ "for delta encoding of keys in data block.");
+
+DEFINE_int32(
+ index_block_restart_interval,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
+ "Number of keys between restart points "
+ "for delta encoding of keys in index block.");
+
+DEFINE_int32(read_amp_bytes_per_bit,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
+ "Number of bytes per bit to be used in block read-amp bitmap");
+
+DEFINE_bool(
+ enable_index_compression,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
+ "Compress the index block");
+
+DEFINE_bool(block_align,
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
+ "Align data blocks on page size");
+
+DEFINE_bool(use_data_block_hash_index, false,
+ "if use kDataBlockBinaryAndHash "
+ "instead of kDataBlockBinarySearch. "
+ "This is valid if only we use BlockTable");
+
+DEFINE_double(data_block_hash_table_util_ratio, 0.75,
+ "util ratio for data block hash index table. "
+ "This is only valid if use_data_block_hash_index is "
+ "set to true");
+
+DEFINE_int64(compressed_cache_size, -1,
+ "Number of bytes to use as a cache of compressed data.");
+
+DEFINE_int64(row_cache_size, 0,
+ "Number of bytes to use as a cache of individual rows"
+ " (0 = disabled).");
+
+DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
+ "Maximum number of files to keep open at the same time"
+ " (use default if == 0)");
+
+DEFINE_int32(file_opening_threads,
+ ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
+ "If open_files is set to -1, this option set the number of "
+ "threads that will be used to open files during DB::Open()");
+
+DEFINE_bool(new_table_reader_for_compaction_inputs, true,
+ "If true, uses a separate file handle for compaction inputs");
+
+DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
+
+DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
+
+DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
+ "Maximum windows randomaccess buffer size");
+
+DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
+ "Maximum write buffer for Writable File");
+
+DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
+ " use default settings.");
+DEFINE_double(memtable_bloom_size_ratio, 0,
+ "Ratio of memtable size used for bloom filter. 0 means no bloom "
+ "filter.");
+DEFINE_bool(memtable_whole_key_filtering, false,
+ "Try to use whole key bloom filter in memtables.");
+DEFINE_bool(memtable_use_huge_page, false,
+ "Try to use huge page in memtables.");
+
+DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
+ " database. If you set this flag and also specify a benchmark that"
+ " wants a fresh database, that benchmark will fail.");
+
+DEFINE_bool(use_existing_keys, false,
+ "If true, uses existing keys in the DB, "
+ "rather than generating new ones. This involves some startup "
+ "latency to load all keys into memory. It is supported for the "
+ "same read/overwrite benchmarks as `-use_existing_db=true`, which "
+ "must also be set for this flag to be enabled. When this flag is "
+ "set, the value for `-num` will be ignored.");
+
+DEFINE_bool(show_table_properties, false,
+ "If true, then per-level table"
+ " properties will be printed on every stats-interval when"
+ " stats_interval is set and stats_per_interval is on.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+// Read cache flags
+
+DEFINE_string(read_cache_path, "",
+ "If not empty string, a read cache will be used in this path");
+
+DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
+ "Maximum size of the read cache");
+
+DEFINE_bool(read_cache_direct_write, true,
+ "Whether to use Direct IO for writing to the read cache");
+
+DEFINE_bool(read_cache_direct_read, true,
+ "Whether to use Direct IO for reading from read cache");
+
+DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
+
+// Flag validator enforcing an upper bound on the shard-bit count.
+//
+// flagname: name of the flag being validated, used only in the message.
+// value:    the value supplied for the flag.
+// Returns true for any value below 20 (negative values included); otherwise
+// prints a diagnostic to stderr and returns false.
+static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
+  if (value < 20) {
+    return true;
+  }
+  fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
+          value);
+  return false;
+}
+
+DEFINE_bool(verify_checksum, true,
+ "Verify checksum for every block read"
+ " from storage");
+
+DEFINE_bool(statistics, false, "Database statistics");
+DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
+ "stats level for statistics");
+DEFINE_string(statistics_string, "", "Serialized statistics string");
+static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
+
+// Write-count override; mirrors --reads and --deletes, whose help text names
+// their own operation when describing the negative-value fallback.
+DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
+ " --num writes.");
+
+DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
+
+DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
+ "Truth key/values used when using verify");
+
+DEFINE_int32(num_levels, 7, "The total number of levels");
+
+DEFINE_int64(target_file_size_base,
+ ROCKSDB_NAMESPACE::Options().target_file_size_base,
+ "Target file size at level-1");
+
+DEFINE_int32(target_file_size_multiplier,
+ ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
+ "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base,
+ ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
+ "Max bytes for level-1");
+
+DEFINE_bool(level_compaction_dynamic_level_bytes, false,
+ "Whether level size base is dynamic");
+
+DEFINE_double(max_bytes_for_level_multiplier, 10,
+ "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
+DEFINE_string(max_bytes_for_level_multiplier_additional, "",
+ "A vector that specifies additional fanout per level");
+
+DEFINE_int32(level0_stop_writes_trigger,
+ ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
+ "Number of files in level-0"
+ " that will trigger put stop.");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+ ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
+ "Number of files in level-0"
+ " that will slow down writes.");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+ ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
+ "Number of files in level-0"
+ " when compactions start");
+
+// Flag validator for percentage flags that must lie strictly between 0 and
+// 100.
+//
+// flagname: name of the flag being validated, used only in the message.
+// value:    the value supplied for the flag.
+// Returns true when 0 < value < 100; otherwise prints a diagnostic to stderr
+// and returns false.
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  const bool in_range = (value > 0) && (value < 100);
+  if (!in_range) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
+            value);
+  }
+  return in_range;
+}
+DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
+ " as percentage) for the ReadRandomWriteRandom workload. The "
+ "default value 90 means 90% operations out of all reads and writes"
+ " operations are reads. In other words, 9 gets for every 1 put.");
+
+DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
+ " as percentage) for the ReadRandomMergeRandom workload. The"
+ " default value 70 means 70% out of all read and merge operations"
+ " are merges. In other words, 7 merges for every 3 gets.");
+
+DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
+ "deletes (used in RandomWithVerify only). RandomWithVerify "
+ "calculates writepercent as (100 - FLAGS_readwritepercent - "
+ "deletepercent), so deletepercent must be smaller than (100 - "
+ "FLAGS_readwritepercent)");
+
+// Help text reworded for grammar only; the flag's default is unchanged.
+DEFINE_bool(optimize_filters_for_hits, false,
+ "Optimizes bloom filters for workloads where most lookups return "
+ "a value. For now this doesn't create bloom filters for the max "
+ "level of the LSM to reduce metadata that should fit in RAM. ");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0,
+ "Ignored. Left here for backward compatibility");
+
+DEFINE_int64(writes_before_delete_range, 0,
+ "Number of writes before DeleteRange is called regularly.");
+
+DEFINE_int64(writes_per_range_tombstone, 0,
+ "Number of writes between range tombstones");
+
+DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
+
+DEFINE_int64(max_num_range_tombstones, 0,
+ "Maximum number of range tombstones "
+ "to insert.");
+
+DEFINE_bool(expand_range_tombstones, false,
+ "Expand range tombstone into sequential regular tombstones.");
+
+#ifndef ROCKSDB_LITE
+// Transactions Options
+DEFINE_bool(optimistic_transaction_db, false,
+ "Open a OptimisticTransactionDB instance. "
+ "Required for randomtransaction benchmark.");
+
+DEFINE_bool(transaction_db, false,
+ "Open a TransactionDB instance. "
+ "Required for randomtransaction benchmark.");
+
+DEFINE_uint64(transaction_sets, 2,
+ "Number of keys each transaction will "
+ "modify (use in RandomTransaction only). Max: 9999");
+
+DEFINE_bool(transaction_set_snapshot, false,
+ "Setting to true will have each transaction call SetSnapshot()"
+ " upon creation.");
+
+DEFINE_int32(transaction_sleep, 0,
+ "Max microseconds to sleep in between "
+ "reading and writing a value (used in RandomTransaction only). ");
+
+DEFINE_uint64(transaction_lock_timeout, 100,
+ "If using a transaction_db, specifies the lock wait timeout in"
+ " milliseconds before failing a transaction waiting on a lock");
+DEFINE_string(
+ options_file, "",
+ "The path to a RocksDB options file. If specified, then db_bench will "
+ "run with the RocksDB options in the default column family of the "
+ "specified options file. "
+ "Note that with this setting, db_bench will ONLY accept the following "
+ "RocksDB options related command-line arguments, all other arguments "
+ "that are related to RocksDB options will be ignored:\n"
+ "\t--use_existing_db\n"
+ "\t--use_existing_keys\n"
+ "\t--statistics\n"
+ "\t--row_cache_size\n"
+ "\t--row_cache_numshardbits\n"
+ "\t--enable_io_prio\n"
+ "\t--dump_malloc_stats\n"
+ "\t--num_multi_db\n");
+
+// FIFO Compaction Options
+DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
+ "The limit of total table file sizes to trigger FIFO compaction");
+
+// FIFO compaction tuning flags.
+DEFINE_bool(fifo_compaction_allow_compaction, true,
+ "Allow compaction in FIFO compaction.");
+
+DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
+
+// Blob DB Options
+// Flags whose default is written as BlobDBOptions().<field> take their
+// default from a default-constructed blob_db::BlobDBOptions.
+DEFINE_bool(use_blob_db, false,
+ "Open a BlobDB instance. "
+ "Required for large value benchmark.");
+
+DEFINE_bool(
+ blob_db_enable_gc,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
+ "Enable BlobDB garbage collection.");
+
+DEFINE_double(
+ blob_db_gc_cutoff,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
+ "Cutoff ratio for BlobDB garbage collection.");
+
+DEFINE_bool(blob_db_is_fifo,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
+ "Enable FIFO eviction strategy in BlobDB.");
+
+DEFINE_uint64(blob_db_max_db_size,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
+ "Max size limit of the directory where blob files are stored.");
+
+DEFINE_uint64(
+ blob_db_max_ttl_range, 0,
+ "TTL range to generate BlobDB data (in seconds). 0 means no TTL.");
+
+DEFINE_uint64(blob_db_ttl_range_secs,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
+ "TTL bucket size to use when creating blob files.");
+
+DEFINE_uint64(blob_db_min_blob_size,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
+ "Smallest blob to store in a file. Blobs smaller than this "
+ "will be inlined with the key in the LSM tree.");
+
+DEFINE_uint64(blob_db_bytes_per_sync,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
+ "Bytes to sync blob file at.");
+
+DEFINE_uint64(blob_db_file_size,
+ ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
+ "Target size of each blob file.");
+
+DEFINE_string(blob_db_compression_type, "snappy",
+ "Algorithm to use to compress blob in blob file");
+// Enum counterpart of --blob_db_compression_type, initialized to Snappy to
+// match the flag's default string.
+// NOTE(review): presumably overwritten from the parsed string flag at
+// startup; that assignment is not in this section -- confirm.
+static enum ROCKSDB_NAMESPACE::CompressionType
+ FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
+
+// Secondary DB instance Options
+DEFINE_bool(use_secondary_db, false,
+ "Open a RocksDB secondary instance. A primary instance can be "
+ "running in another db_bench process.");
+
+DEFINE_string(secondary_path, "",
+ "Path to a directory used by the secondary instance to store "
+ "private files, e.g. info log.");
+
+DEFINE_int32(secondary_update_interval, 5,
+ "Secondary instance attempts to catch up with the primary every "
+ "secondary_update_interval seconds.");
+
+#endif // ROCKSDB_LITE
+
+// Background I/O reporting, info-log destination and tracing flags.
+DEFINE_bool(report_bg_io_stats, false,
+ "Measure times spents on I/Os while in compactions. ");
+
+DEFINE_bool(use_stderr_info_logger, false,
+ "Write info logs to stderr instead of to LOG file. ");
+
+DEFINE_string(trace_file, "", "Trace workload to a file. ");
+
+DEFINE_int32(trace_replay_fast_forward, 1,
+ "Fast forward trace replay, must >= 1. ");
+DEFINE_int32(block_cache_trace_sampling_frequency, 1,
+ "Block cache trace sampling frequency, termed s. It uses spatial "
+ "downsampling and samples accesses to one out of s blocks.");
+// The default is computed in uint64_t (64 GiB) and then stored in an int64
+// flag; the value fits in the signed range.
+DEFINE_int64(
+ block_cache_trace_max_trace_file_size_in_bytes,
+ uint64_t{64} * 1024 * 1024 * 1024,
+ "The maximum block cache trace file size in bytes. Block cache accesses "
+ "will not be logged if the trace file size exceeds this threshold. Default "
+ "is 64 GB.");
+DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
+DEFINE_int32(trace_replay_threads, 1,
+ "The number of threads to replay, must >=1.");
+
+// Translates a compression name such as "zlib" (matched case-insensitively)
+// into the corresponding CompressionType. A name that is not recognized is
+// reported on stdout and Snappy is returned instead.
+static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
+    const char* ctype) {
+  assert(ctype);
+
+  // Name -> enum lookup table, scanned in order.
+  static const struct {
+    const char* name;
+    ROCKSDB_NAMESPACE::CompressionType type;
+  } kKnownTypes[] = {
+      {"none", ROCKSDB_NAMESPACE::kNoCompression},
+      {"snappy", ROCKSDB_NAMESPACE::kSnappyCompression},
+      {"zlib", ROCKSDB_NAMESPACE::kZlibCompression},
+      {"bzip2", ROCKSDB_NAMESPACE::kBZip2Compression},
+      {"lz4", ROCKSDB_NAMESPACE::kLZ4Compression},
+      {"lz4hc", ROCKSDB_NAMESPACE::kLZ4HCCompression},
+      {"xpress", ROCKSDB_NAMESPACE::kXpressCompression},
+      {"zstd", ROCKSDB_NAMESPACE::kZSTD},
+  };
+
+  for (const auto& candidate : kKnownTypes) {
+    if (strcasecmp(ctype, candidate.name) == 0) {
+      return candidate.type;
+    }
+  }
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
+}
+
+// Returns the name of the i-th column family: index 0 is the default column
+// family, every other index gets "column_family_name_" followed by the
+// zero-padded (6 digit) index.
+static std::string ColumnFamilyName(size_t i) {
+  if (i == 0) {
+    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
+  }
+  char buf[100];
+  snprintf(buf, sizeof(buf), "column_family_name_%06zu", i);
+  return std::string(buf);
+}
+
+// Compression flags for the database itself.
+DEFINE_string(compression_type, "snappy",
+ "Algorithm to use to compress the database");
+// Enum counterpart of --compression_type, initialized to Snappy to match the
+// flag's default string.
+// NOTE(review): presumably overwritten from the parsed string flag at
+// startup; that assignment is not in this section -- confirm.
+static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
+ ROCKSDB_NAMESPACE::kSnappyCompression;
+
+DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");
+
+// The next three flags default to the matching fields of a
+// default-constructed CompressionOptions.
+DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
+ "Compression level. The meaning of this value is library-"
+ "dependent. If unset, we try to use the default for the library "
+ "specified in `--compression_type`");
+
+DEFINE_int32(compression_max_dict_bytes,
+ ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
+ "Maximum size of dictionary used to prime the compression "
+ "library.");
+
+DEFINE_int32(compression_zstd_max_train_bytes,
+ ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
+ "Maximum size of training data passed to zstd's dictionary "
+ "trainer.");
+
+DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
+ " from this level. Levels with number < min_level_to_compress are"
+ " not compressed. Otherwise, apply compression_type to "
+ "all levels.");
+
+// Flag validator: accepts values in the range (0, 20]. Anything else is
+// reported on stderr and rejected.
+static bool ValidateTableCacheNumshardbits(const char* flagname,
+                                           int32_t value) {
+  const bool in_range = value > 0 && value <= 20;
+  if (!in_range) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n",
+            flagname, value);
+  }
+  return in_range;
+}
+// Table cache sharding, Env selection and periodic stats reporting flags.
+DEFINE_int32(table_cache_numshardbits, 4, "");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(env_uri, "", "URI for registry Env lookup. Mutually exclusive"
+ " with --hdfs.");
+#endif // ROCKSDB_LITE
+DEFINE_string(hdfs, "", "Name of hdfs environment. Mutually exclusive with"
+ " --env_uri.");
+
+// Keeps a non-default Env alive when one is held through a shared_ptr.
+// NOTE(review): presumably set when --env_uri is resolved -- confirm.
+static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
+
+// Env used by the benchmark; starts out as the process-wide default Env.
+static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
+
+DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
+ "this is greater than zero. When 0 the interval grows over time.");
+
+DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
+ "overrides stats_interval when both are > 0.");
+
+DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
+ " this is greater than 0.");
+
+// Help text fix: the report file is CSV (see the "report.csv" default
+// below), not "CVS".
+DEFINE_int64(report_interval_seconds, 0,
+ "If greater than zero, it will write simple stats in CSV format "
+ "to --report_file every N seconds");
+
+DEFINE_string(report_file, "report.csv",
+ "Filename where some simple stats are reported to (if "
+ "--report_interval_seconds is bigger than 0)");
+
+DEFINE_int32(thread_status_per_interval, 0,
+ "Takes and report a snapshot of the current status of each thread"
+ " when this is greater than 0.");
+
+DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
+ "Level of perf collection");
+
+// Flag validator for rate limits: the value must be non-negative, allowing
+// a tiny tolerance for floating point noise. Rejected values are reported
+// on stderr.
+static bool ValidateRateLimit(const char* flagname, double value) {
+  const double kTolerance = 1e-10;
+  const bool is_negative = value < -kTolerance;
+  if (!is_negative) {
+    return true;
+  }
+  fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
+          flagname, value);
+  return false;
+}
+// Deprecated rate limit flags; they are still defined (and validated with
+// ValidateRateLimit further down in this file).
+DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");
+
+DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");
+
+// Write stall thresholds: defaults are 64 GiB (slowdown) and 128 GiB (stop).
+DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
+ "Slowdown writes if pending compaction bytes exceed this number");
+
+DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
+ "Stop writes if pending compaction bytes exceed this number");
+
+// Default is 8388608 bytes, i.e. 8 MiB.
+DEFINE_uint64(delayed_write_rate, 8388608u,
+ "Limited bytes allowed to DB when soft_rate_limit or "
+ "level0_slowdown_writes_trigger triggers");
+
+DEFINE_bool(enable_pipelined_write, true,
+ "Allow WAL and memtable writes to be pipelined");
+
+// Help text fix: the description used to be a verbatim copy of
+// enable_pipelined_write's and did not describe this option.
+DEFINE_bool(unordered_write, false,
+ "Enable the unordered write feature, which provides higher "
+ "throughput but relaxes the guarantees around atomic reads and "
+ "immutable snapshots");
+
+// Memtable write concurrency and in-place update flags. The
+// inplace_update_* defaults come from a default-constructed Options.
+DEFINE_bool(allow_concurrent_memtable_write, true,
+ "Allow multi-writers to update mem tables in parallel.");
+
+DEFINE_bool(inplace_update_support,
+ ROCKSDB_NAMESPACE::Options().inplace_update_support,
+ "Support in-place memtable update for smaller or same-size values");
+
+DEFINE_uint64(inplace_update_num_locks,
+ ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
+ "Number of RW locks to protect in-place memtable updates");
+
+// Writer thread yield tuning.
+DEFINE_bool(enable_write_thread_adaptive_yield, true,
+ "Use a yielding spin loop for brief writer thread waits.");
+
+DEFINE_uint64(
+ write_thread_max_yield_usec, 100,
+ "Maximum microseconds for enable_write_thread_adaptive_yield operation.");
+
+DEFINE_uint64(write_thread_slow_yield_usec, 3,
+ "The threshold at which a slow yield is considered a signal that "
+ "other processes or threads want the core.");
+
+DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
+ "When hard_rate_limit is set then this is the max time a put will"
+ " be stalled.");
+
+// Rate limiter flags.
+DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
+
+DEFINE_bool(rate_limiter_auto_tuned, false,
+ "Enable dynamic adjustment of rate limit according to demand for "
+ "background I/O");
+
+
+// Sine-wave write rate: sine_a..sine_d are the coefficients of
+// f(x) = A sin(bx + c) + d, as stated in each help string.
+DEFINE_bool(sine_write_rate, false,
+ "Use a sine wave write_rate_limit");
+
+DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
+ "Interval of which the sine wave write_rate_limit is recalculated");
+
+DEFINE_double(sine_a, 1,
+ "A in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_b, 1,
+ "B in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_c, 0,
+ "C in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_d, 1,
+ "D in f(x) = A sin(bx + c) + d");
+
+DEFINE_bool(rate_limit_bg_reads, false,
+ "Use options.rate_limiter on compaction reads");
+
+DEFINE_uint64(
+ benchmark_write_rate_limit, 0,
+ "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
+ "is the global rate in bytes/second.");
+
+// Parameters of the mix_graph workload.
+//
+// Help text fixes in this group:
+//  - value_sigma was described as parameter 'theta'; it is 'sigma'.
+//  - keyrange_dist_c / keyrange_dist_d lacked a space between the two
+//    concatenated literals, printing "distributionf(x)=...".
+//  - spelling: "Generized" -> "Generalized", "acccess" -> "access".
+
+// Key-range (prefix) access distribution: f(x)=a*exp(b*x)+c*exp(d*x).
+DEFINE_double(keyrange_dist_a, 0.0,
+ "The parameter 'a' of prefix average access distribution "
+ "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_b, 0.0,
+ "The parameter 'b' of prefix average access distribution "
+ "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_c, 0.0,
+ "The parameter 'c' of prefix average access distribution "
+ "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_d, 0.0,
+ "The parameter 'd' of prefix average access distribution "
+ "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_int64(keyrange_num, 1,
+ "The number of key ranges that are in the same prefix "
+ "group, each prefix range will have its key access "
+ "distribution");
+// Key access distribution: f(x)=a*x^b.
+DEFINE_double(key_dist_a, 0.0,
+ "The parameter 'a' of key access distribution model "
+ "f(x)=a*x^b");
+DEFINE_double(key_dist_b, 0.0,
+ "The parameter 'b' of key access distribution model "
+ "f(x)=a*x^b");
+// Generalized Pareto Distribution parameters, per the help strings below
+// (value_* and iter_* each take theta, k and sigma).
+DEFINE_double(value_theta, 0.0,
+ "The parameter 'theta' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(value_k, 0.0,
+ "The parameter 'k' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(value_sigma, 0.0,
+ "The parameter 'sigma' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(iter_theta, 0.0,
+ "The parameter 'theta' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(iter_k, 0.0,
+ "The parameter 'k' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(iter_sigma, 0.0,
+ "The parameter 'sigma' of Generalized Pareto Distribution "
+ "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+// Query mix and size limits.
+DEFINE_double(mix_get_ratio, 1.0,
+ "The ratio of Get queries of mix_graph workload");
+DEFINE_double(mix_put_ratio, 0.0,
+ "The ratio of Put queries of mix_graph workload");
+DEFINE_double(mix_seek_ratio, 0.0,
+ "The ratio of Seek queries of mix_graph workload");
+DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
+DEFINE_int64(mix_ave_kv_size, 512,
+ "The average key-value size of this workload");
+DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
+DEFINE_double(
+ sine_mix_rate_noise, 0.0,
+ "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
+DEFINE_bool(sine_mix_rate, false,
+ "Enable the sine QPS control on the mix workload");
+DEFINE_uint64(
+ sine_mix_rate_interval_milliseconds, 10000,
+ "Interval of which the sine wave read_rate_limit is recalculated");
+DEFINE_int64(mix_accesses, -1,
+ "The total query accesses of mix_graph workload");
+
+// Read rate limiting, compaction size, WAL retention and file I/O mode
+// flags. Flags whose default is written as Options().<field> take their
+// default from a default-constructed Options.
+DEFINE_uint64(
+ benchmark_read_rate_limit, 0,
+ "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
+ "is the global rate in ops/second.");
+
+DEFINE_uint64(max_compaction_bytes,
+ ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
+ "Max bytes allowed in one compaction");
+
+#ifndef ROCKSDB_LITE
+DEFINE_bool(readonly, false, "Run read only benchmarks.");
+
+DEFINE_bool(print_malloc_stats, false,
+ "Print malloc stats to stdout after benchmarks finish.");
+#endif // ROCKSDB_LITE
+
+DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
+
+DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
+DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
+ " in MB.");
+DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
+
+DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
+ "Allow reads to occur via mmap-ing files");
+
+DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
+ "Allow writes to occur via mmap-ing files");
+
+DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
+ "Use O_DIRECT for reading data");
+
+DEFINE_bool(use_direct_io_for_flush_and_compaction,
+ ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
+ "Use O_DIRECT for background flush and compaction writes");
+
+DEFINE_bool(advise_random_on_open,
+ ROCKSDB_NAMESPACE::Options().advise_random_on_open,
+ "Advise random access on table file open");
+
+DEFINE_string(compaction_fadvice, "NORMAL",
+ "Access pattern advice when a file is compacted");
+// Enum counterpart of --compaction_fadvice; its type is deduced from
+// Options::access_hint_on_compaction_start and it starts at that default.
+// NOTE(review): presumably overwritten from the parsed string flag at
+// startup; that assignment is not in this section -- confirm.
+static auto FLAGS_compaction_fadvice_e =
+ ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
+
+// Iterator, mutex and sync flags, followed by flags that (per their help
+// text) only apply to the RandomReplaceKeys and TimeSeries benchmarks.
+DEFINE_bool(use_tailing_iterator, false,
+ "Use tailing iterator to access a series of keys instead of get");
+
+DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
+ "Use adaptive mutex");
+
+DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
+ "Allows OS to incrementally sync SST files to disk while they are"
+ " being written, in the background. Issue one request for every"
+ " bytes_per_sync written. 0 turns it off.");
+
+DEFINE_uint64(wal_bytes_per_sync,
+ ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
+ "Allows OS to incrementally sync WAL files to disk while they are"
+ " being written, in the background. Issue one request for every"
+ " wal_bytes_per_sync written. 0 turns it off.");
+
+// RandomReplaceKeys-only flags.
+DEFINE_bool(use_single_deletes, true,
+ "Use single deletes (used in RandomReplaceKeys only).");
+
+DEFINE_double(stddev, 2000.0,
+ "Standard deviation of normal distribution used for picking keys"
+ " (used in RandomReplaceKeys only).");
+
+// TimeSeries-only flags.
+DEFINE_int32(key_id_range, 100000,
+ "Range of possible value of key id (used in TimeSeries only).");
+
+DEFINE_string(expire_style, "none",
+ "Style to remove expired time entries. Can be one of the options "
+ "below: none (do not expired data), compaction_filter (use a "
+ "compaction filter to remove expired data), delete (seek IDs and "
+ "remove expired data) (used in TimeSeries only).");
+
+DEFINE_uint64(
+ time_range, 100000,
+ "Range of timestamp that store in the database (used in TimeSeries"
+ " only).");
+
+DEFINE_int32(num_deletion_threads, 1,
+ "Number of threads to do deletion (used in TimeSeries and delete "
+ "expire_style only).");
+
+DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
+ " operations on a key in the memtable");
+
+// Flag validator for --prefix_size: accepts 0 <= value < 2000000000.
+// Rejected values are reported on stderr.
+//
+// The check itself is unchanged. Only the error message was corrected: it
+// used to say "<=2000000000" although 2000000000 itself is rejected.
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value >= 2000000000) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize < 2000000000\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+
+// Prefix handling, background thread priority, stats dumping and MultiGet
+// flags. Flags whose default is written as Options().<field> take their
+// default from a default-constructed Options.
+DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
+ "plain table");
+DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
+ "per prefix, 0 means no special handling of the prefix, "
+ "i.e. use the prefix comes with the generated random number.");
+DEFINE_bool(total_order_seek, false,
+ "Enable total order seek regardless of index format.");
+DEFINE_bool(prefix_same_as_start, false,
+ "Enforce iterator to return keys with prefix same as seek key.");
+DEFINE_bool(
+ seek_missing_prefix, false,
+ "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");
+
+DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
+ "If non-zero, enable "
+ "memtable insert with hint with the given prefix size.");
+DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
+ "threads' IO priority");
+DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction "
+ "threads' CPU priority");
+DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
+ "table becomes an identity function. This is only valid when key "
+ "is 8 bytes");
+DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
+DEFINE_uint64(stats_dump_period_sec,
+ ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
+ "Gap between printing stats to log in seconds");
+DEFINE_uint64(stats_persist_period_sec,
+ ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
+ "Gap between persisting stats in seconds");
+DEFINE_bool(persist_stats_to_disk,
+ ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
+ "whether to persist stats to disk");
+DEFINE_uint64(stats_history_buffer_size,
+ ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
+ "Max number of stats snapshots to keep in memory");
+DEFINE_int64(multiread_stride, 0,
+ "Stride length for the keys in a MultiGet batch");
+DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
+
+// Memtable representation choices. StringToRepFactory (below in this file)
+// maps the names shown here onto these values.
+enum RepFactory {
+ kSkipList, // "skip_list"; also the fallback for unknown names
+ kPrefixHash, // "prefix_hash"
+ kVectorRep, // "vector"
+ kHashLinkedList, // "hash_linkedlist"
+};
+
+// Translates a memtable representation name (matched case-insensitively)
+// into a RepFactory value. A name that is not recognized is reported on
+// stdout and kSkipList is returned instead.
+static enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  // Name -> enum lookup table, scanned in order.
+  static const struct {
+    const char* name;
+    enum RepFactory factory;
+  } kKnownFactories[] = {
+      {"skip_list", kSkipList},
+      {"prefix_hash", kPrefixHash},
+      {"vector", kVectorRep},
+      {"hash_linkedlist", kHashLinkedList},
+  };
+
+  for (const auto& candidate : kKnownFactories) {
+    if (strcasecmp(ctype, candidate.name) == 0) {
+      return candidate.factory;
+    }
+  }
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+
+// Memtable representation and table format flags.
+
+// Enum counterpart of --memtablerep.
+// NOTE(review): presumably assigned from StringToRepFactory at startup;
+// that assignment is not in this section -- confirm.
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "skip_list", "");
+DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
+DEFINE_bool(use_plain_table, false, "if use plain table "
+ "instead of block-based table format");
+DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
+DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
+DEFINE_bool(use_hash_search, false, "if use kHashSearch "
+ "instead of kBinarySearch. "
+ "This is valid if only we use BlockTable");
+DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
+ "instead of kFullFilter for filter block. "
+ "This is valid if only we use BlockTable");
+// Help text fix: the concatenated literals used to run the sentences
+// together ("database.If a new ..." and "fresh database The possible ...").
+DEFINE_string(merge_operator, "", "The merge operator to use with the database."
+ " If a new merge operator is specified, be sure to use a fresh"
+ " database. The possible merge operators are defined in"
+ " utilities/merge_operators.h");
+DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
+ "linear search first for this many steps from the previous "
+ "position");
+DEFINE_bool(report_file_operations, false, "if report number of file "
+ "operations");
+DEFINE_int32(readahead_size, 0, "Iterator readahead size");
+
+// Flag validator registration.
+//
+// Each *_dummy constant exists only so that its initializer, a call to
+// RegisterFlagValidator, runs during static initialization. The constants
+// are marked unused to silence compiler warnings. ValidateKeySize,
+// ValidateCacheNumshardbits and ValidateInt32Percent are defined outside
+// this section.
+static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_hard_rate_limit_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+
+static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+
+static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_cache_numshardbits,
+ &ValidateCacheNumshardbits);
+
+static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
+
+// Note: declared as an int32 flag although its default is written as
+// `false` (which converts to 0).
+DEFINE_int32(disable_seek_compaction, false,
+ "Not used, left here for backwards compatibility");
+
+static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
+static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) =
+ RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+ &ValidateTableCacheNumshardbits);
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// Atomic counters of file operations, updated by the wrappers that
+// ReportFileOpEnv hands out.
+struct ReportFileOpCounters {
+ std::atomic<int> open_counter_; // files successfully opened
+ std::atomic<int> read_counter_; // Read() calls issued
+ std::atomic<int> append_counter_; // Append() calls issued
+ std::atomic<uint64_t> bytes_read_; // sum of result sizes returned by Read()
+ std::atomic<uint64_t> bytes_written_; // sum of data sizes passed to Append()
+};
+
+// A special Env that records and reports file operations in db_bench.
+//
+// It forwards file creation to the wrapped Env. Every sequential,
+// random-access and writable file that is opened successfully is wrapped in
+// a local CountingFile, which updates the shared ReportFileOpCounters and
+// then delegates to the real file. All counter updates use relaxed atomic
+// increments.
+class ReportFileOpEnv : public EnvWrapper {
+ public:
+ // Wraps `base` and starts with all counters at zero.
+ explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }
+
+ // Sets every counter back to zero.
+ void reset() {
+ counters_.open_counter_ = 0;
+ counters_.read_counter_ = 0;
+ counters_.append_counter_ = 0;
+ counters_.bytes_read_ = 0;
+ counters_.bytes_written_ = 0;
+ }
+
+ // Opens `f` through the wrapped Env; on success counts the open and
+ // replaces *r with a wrapper that counts reads.
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public SequentialFile {
+ private:
+ std::unique_ptr<SequentialFile> target_;
+ ReportFileOpCounters* counters_;
+
+ public:
+ CountingFile(std::unique_ptr<SequentialFile>&& target,
+ ReportFileOpCounters* counters)
+ : target_(std::move(target)), counters_(counters) {}
+
+ // Counts the call, forwards it, then adds the size of the returned
+ // slice to bytes_read_ (regardless of the returned status).
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
+ Status rv = target_->Read(n, result, scratch);
+ counters_->bytes_read_.fetch_add(result->size(),
+ std::memory_order_relaxed);
+ return rv;
+ }
+
+ // Forwarded without counting.
+ Status Skip(uint64_t n) override { return target_->Skip(n); }
+ };
+
+ Status s = target()->NewSequentialFile(f, r, soptions);
+ if (s.ok()) {
+ counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+ // Move the real file into the wrapper and hand the wrapper back in *r.
+ r->reset(new CountingFile(std::move(*r), counters()));
+ }
+ return s;
+ }
+
+ // Same as NewSequentialFile, for random-access files.
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public RandomAccessFile {
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ ReportFileOpCounters* counters_;
+
+ public:
+ CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+ ReportFileOpCounters* counters)
+ : target_(std::move(target)), counters_(counters) {}
+ // Counts the call, forwards it, then adds the size of the returned
+ // slice to bytes_read_ (regardless of the returned status).
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
+ Status rv = target_->Read(offset, n, result, scratch);
+ counters_->bytes_read_.fetch_add(result->size(),
+ std::memory_order_relaxed);
+ return rv;
+ }
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r, soptions);
+ if (s.ok()) {
+ counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+ r->reset(new CountingFile(std::move(*r), counters()));
+ }
+ return s;
+ }
+
+ // Opens `f` for writing through the wrapped Env; on success counts the
+ // open and replaces *r with a wrapper that counts appends.
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public WritableFile {
+ private:
+ std::unique_ptr<WritableFile> target_;
+ ReportFileOpCounters* counters_;
+
+ public:
+ CountingFile(std::unique_ptr<WritableFile>&& target,
+ ReportFileOpCounters* counters)
+ : target_(std::move(target)), counters_(counters) {}
+
+ // Counts the call, forwards it, then adds data.size() to
+ // bytes_written_ (regardless of the returned status).
+ Status Append(const Slice& data) override {
+ counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
+ Status rv = target_->Append(data);
+ counters_->bytes_written_.fetch_add(data.size(),
+ std::memory_order_relaxed);
+ return rv;
+ }
+
+ // The remaining operations are forwarded without counting.
+ Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+ Status Close() override { return target_->Close(); }
+ Status Flush() override { return target_->Flush(); }
+ Status Sync() override { return target_->Sync(); }
+ };
+
+ Status s = target()->NewWritableFile(f, r, soptions);
+ if (s.ok()) {
+ counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
+ r->reset(new CountingFile(std::move(*r), counters()));
+ }
+ return s;
+ }
+
+ // getter
+ // Returns a pointer to the counters owned by this Env.
+ ReportFileOpCounters* counters() { return &counters_; }
+
+ private:
+ ReportFileOpCounters counters_;
+};
+
+} // namespace
+
+// Kinds of value-size distribution. StringToDistributionType (below in this
+// file) maps the names shown here onto these values.
+enum DistributionType : unsigned char {
+ kFixed = 0, // "fixed"; also the fallback for unknown names
+ kUniform, // "uniform"
+ kNormal // "normal"
+};
+
+// Selected value-size distribution; defaults to kFixed.
+// NOTE(review): presumably assigned from a string flag at startup; that
+// assignment is not in this section -- confirm.
+static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
+
+// Translates a distribution name (matched case-insensitively) into a
+// DistributionType. A name that is not recognized is reported on stdout and
+// kFixed is returned instead.
+static enum DistributionType StringToDistributionType(const char* ctype) {
+  assert(ctype);
+
+  // Name -> enum lookup table, scanned in order.
+  static const struct {
+    const char* name;
+    enum DistributionType type;
+  } kKnownDistributions[] = {
+      {"fixed", kFixed},
+      {"uniform", kUniform},
+      {"normal", kNormal},
+  };
+
+  for (const auto& candidate : kKnownDistributions) {
+    if (strcasecmp(ctype, candidate.name) == 0) {
+      return candidate.type;
+    }
+  }
+
+  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
+  return kFixed;  // default value
+}
+
+// Base class for value-size distributions.
+//
+// Subclasses provide the raw sample through Get(). Generate() returns that
+// sample, clamped into [min, max] unless the subclass turns clamping off
+// by overriding NeedTruncate().
+class BaseDistribution {
+ public:
+  BaseDistribution(unsigned int min, unsigned int max)
+      : min_value_size_(min), max_value_size_(max) {}
+  virtual ~BaseDistribution() {}
+
+  // Draws one sample and clamps it when the subclass asks for truncation.
+  unsigned int Generate() {
+    const unsigned int sample = Get();
+    if (!NeedTruncate()) {
+      return sample;
+    }
+    // Raise to the lower bound first, then cap at the upper bound.
+    return std::min(max_value_size_, std::max(min_value_size_, sample));
+  }
+
+ private:
+  // Produces one raw, unclamped sample.
+  virtual unsigned int Get() = 0;
+  // Whether Generate() has to clamp the raw sample; true by default.
+  virtual bool NeedTruncate() { return true; }
+
+  unsigned int min_value_size_;
+  unsigned int max_value_size_;
+};
+
+// Distribution that yields the same size for every sample.
+class FixedDistribution : public BaseDistribution {
+ public:
+  FixedDistribution(unsigned int size)
+      : BaseDistribution(size, size), size_(size) {}
+
+ private:
+  // Every sample is the configured size.
+  unsigned int Get() override { return size_; }
+  // Both bounds equal size_, so clamping is skipped.
+  bool NeedTruncate() override { return false; }
+
+  unsigned int size_;
+};
+
+// Value-size distribution that samples a normal distribution centered on
+// the midpoint of [min, max], with a standard deviation of (max - min) / 6
+// so that about 99.7% of the raw samples fall inside the range.
+//
+// Changes compared with the previous version:
+//  - Get() clamps the sample to [min, max] while it is still a double. The
+//    distribution is unbounded, and converting a negative or too-large
+//    double to unsigned int is undefined behaviour. For samples that were
+//    already convertible the result of Generate() is unchanged, because
+//    BaseDistribution clamps to the same range afterwards.
+//  - The mean is computed in double, so min + max can no longer wrap around
+//    in unsigned arithmetic.
+class NormalDistribution
+    : public BaseDistribution, public std::normal_distribution<double> {
+ public:
+  NormalDistribution(unsigned int min, unsigned int max)
+      : BaseDistribution(min, max),
+        // 99.7% values within the range [min, max].
+        std::normal_distribution<double>(
+            (static_cast<double>(min) + static_cast<double>(max)) /
+                2.0 /*mean*/,
+            static_cast<double>(max - min) / 6.0 /*stddev*/),
+        lower_bound_(static_cast<double>(min)),
+        upper_bound_(static_cast<double>(max)),
+        gen_(rd_()) {}
+
+ private:
+  // Draws one sample, clamps it to [min, max] and truncates it to an
+  // integer.
+  unsigned int Get() override {
+    const double sample = (*this)(gen_);
+    const double clamped =
+        std::min(std::max(sample, lower_bound_), upper_bound_);
+    return static_cast<unsigned int>(clamped);
+  }
+
+  // Range bounds kept as doubles for clamping before the integer cast.
+  double lower_bound_;
+  double upper_bound_;
+  // rd_ must be declared before gen_: gen_ is seeded from rd_.
+  std::random_device rd_;
+  std::mt19937 gen_;
+};
+
+// Value-size distribution that samples integers uniformly from [min, max].
+class UniformDistribution
+    : public BaseDistribution,
+      public std::uniform_int_distribution<unsigned int> {
+ public:
+  UniformDistribution(unsigned int min, unsigned int max)
+      : BaseDistribution(min, max),
+        std::uniform_int_distribution<unsigned int>(min, max),
+        gen_(rd_()) {}
+
+ private:
+  // Draws one sample from the underlying uniform distribution.
+  unsigned int Get() override { return (*this)(gen_); }
+  // The underlying distribution is already bounded, so clamping is skipped.
+  bool NeedTruncate() override { return false; }
+
+  // rd_ must be declared before gen_: gen_ is seeded from rd_.
+  std::random_device rd_;
+  std::mt19937 gen_;
+};
+
+// Helper for quickly generating random data.
+//
+// A buffer of compressible pseudo-random bytes is built once, and slices of
+// it are handed out in a round-robin fashion. The slice length is either
+// given by the caller or drawn from the configured value-size distribution.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+  std::unique_ptr<BaseDistribution> dist_;
+
+ public:
+  RandomGenerator() {
+    auto largest_value = FLAGS_value_size_max;
+    switch (FLAGS_value_size_distribution_type_e) {
+      case kUniform:
+        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
+                                            FLAGS_value_size_max));
+        break;
+      case kNormal:
+        dist_.reset(new NormalDistribution(FLAGS_value_size_min,
+                                           FLAGS_value_size_max));
+        break;
+      case kFixed:
+      default:
+        dist_.reset(new FixedDistribution(value_size));
+        largest_value = value_size;
+    }
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    const unsigned target_size = (unsigned)std::max(1048576, largest_value);
+    Random rnd(301);
+    std::string fragment;
+    while (data_.size() < target_size) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &fragment);
+      data_.append(fragment);
+    }
+    pos_ = 0;
+  }
+
+  // Returns the next `len` bytes of the buffer, starting over at the
+  // beginning when fewer than `len` bytes remain.
+  Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    const char* start = data_.data() + pos_;
+    pos_ += len;
+    return Slice(start, len);
+  }
+
+  // Returns a slice whose length is drawn from the value-size distribution.
+  Slice Generate() {
+    const unsigned int len = dist_->Generate();
+    return Generate(len);
+  }
+};
+
+// Appends `msg` to `*str`, inserting a single space first when `*str`
+// already has content. An empty `msg` leaves `*str` untouched.
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (!msg.empty()) {
+    if (!str->empty()) {
+      str->push_back(' ');
+    }
+    str->append(msg.data(), msg.size());
+  }
+}
+
+// Bundles a DB handle with its column family handles and the bookkeeping
+// needed to create further column families in batches of num_hot while the
+// benchmark is running.
+struct DBWithColumnFamilies {
+ std::vector<ColumnFamilyHandle*> cfh;
+ DB* db;
+#ifndef ROCKSDB_LITE
+ OptimisticTransactionDB* opt_txn_db;
+#endif // ROCKSDB_LITE
+ std::atomic<size_t> num_created; // Need to be updated after all the
+ // new entries in cfh are set.
+ size_t num_hot; // Number of column families to be queried at each moment.
+ // After each CreateNewCf(), another num_hot number of new
+ // Column families will be created and used to be queried.
+ port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
+ std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating
+ // on cfh[i].
+
+ // Starts with no DB, no handles and zeroed counters.
+ DBWithColumnFamilies()
+ : db(nullptr)
+#ifndef ROCKSDB_LITE
+ , opt_txn_db(nullptr)
+#endif // ROCKSDB_LITE
+ {
+ cfh.clear();
+ num_created = 0;
+ num_hot = 0;
+ }
+
+ // Copies the handles, pointers and counters; the copy gets its own
+ // default-constructed create_cf_mutex. The atomic is copied via load().
+ DBWithColumnFamilies(const DBWithColumnFamilies& other)
+ : cfh(other.cfh),
+ db(other.db),
+#ifndef ROCKSDB_LITE
+ opt_txn_db(other.opt_txn_db),
+#endif // ROCKSDB_LITE
+ num_created(other.num_created.load()),
+ num_hot(other.num_hot),
+ cfh_idx_to_prob(other.cfh_idx_to_prob) {
+ }
+
+ // Deletes all column family handles, then deletes either opt_txn_db (when
+ // it is set) or db. Only the pointer that was deleted is reset to nullptr.
+ // NOTE(review): when opt_txn_db is set, db is neither deleted nor cleared;
+ // presumably opt_txn_db owns the same underlying DB -- confirm.
+ void DeleteDBs() {
+ std::for_each(cfh.begin(), cfh.end(),
+ [](ColumnFamilyHandle* cfhi) { delete cfhi; });
+ cfh.clear();
+#ifndef ROCKSDB_LITE
+ if (opt_txn_db) {
+ delete opt_txn_db;
+ opt_txn_db = nullptr;
+ } else {
+ delete db;
+ db = nullptr;
+ }
+#else
+ delete db;
+ db = nullptr;
+#endif // ROCKSDB_LITE
+ }
+
+ // Picks one of the num_hot most recently created column families.
+ // Without cfh_idx_to_prob the choice is rand_num % num_hot. With it, the
+ // entries are accumulated until the running sum plus the next entry is no
+ // longer below rand_num % 100.
+ // NOTE(review): the loop indexes cfh_idx_to_prob before the bounds assert
+ // runs; it appears to rely on the entries summing to 100 -- confirm.
+ ColumnFamilyHandle* GetCfh(int64_t rand_num) {
+ assert(num_hot > 0);
+ size_t rand_offset = 0;
+ if (!cfh_idx_to_prob.empty()) {
+ assert(cfh_idx_to_prob.size() == num_hot);
+ int sum = 0;
+ while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
+ sum += cfh_idx_to_prob[rand_offset];
+ ++rand_offset;
+ }
+ assert(rand_offset < cfh_idx_to_prob.size());
+ } else {
+ rand_offset = rand_num % num_hot;
+ }
+ // The hot set is the last num_hot handles created so far.
+ return cfh[num_created.load(std::memory_order_acquire) - num_hot +
+ rand_offset];
+ }
+
+ // stage: nothing is done if (stage + 1) * num_hot column families already
+ // exist. Otherwise the next num_hot column families are created, filling
+ // cfh[num_created] .. cfh[num_created + num_hot - 1]. cfh must already be
+ // large enough to hold them (asserted). Aborts the process if a column
+ // family cannot be created.
+ void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
+ MutexLock l(&create_cf_mutex);
+ if ((stage + 1) * num_hot <= num_created) {
+ // Already created.
+ return;
+ }
+ auto new_num_created = num_created + num_hot;
+ assert(new_num_created <= cfh.size());
+ for (size_t i = num_created; i < new_num_created; i++) {
+ Status s =
+ db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
+ if (!s.ok()) {
+ fprintf(stderr, "create column family error: %s\n",
+ s.ToString().c_str());
+ abort();
+ }
+ }
+ // Publish the new count only after every new handle is stored, so that
+ // GetCfh's acquire load never sees an unset entry.
+ num_created.store(new_num_created, std::memory_order_release);
+ }
+};
+
+// a class that reports stats to CSV file
+//
+// A background thread wakes up every report_interval_secs seconds and
+// appends one "secs_elapsed,interval_qps" line to the report file. The
+// destructor stops and joins that thread.
+class ReporterAgent {
+ public:
+ // Creates `fname` through `env`, writes the CSV header and starts the
+ // reporting thread. Aborts the process if the file cannot be created or
+ // written.
+ ReporterAgent(Env* env, const std::string& fname,
+ uint64_t report_interval_secs)
+ : env_(env),
+ total_ops_done_(0),
+ last_report_(0),
+ report_interval_secs_(report_interval_secs),
+ stop_(false) {
+ auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
+ if (s.ok()) {
+ s = report_file_->Append(Header() + "\n");
+ }
+ if (s.ok()) {
+ s = report_file_->Flush();
+ }
+ if (!s.ok()) {
+ fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ abort();
+ }
+
+ reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
+ }
+
+ // Signals the reporting thread to stop and waits for it to finish.
+ ~ReporterAgent() {
+ {
+ std::unique_lock<std::mutex> lk(mutex_);
+ stop_ = true;
+ stop_cv_.notify_all();
+ }
+ reporting_thread_.join();
+ }
+
+ // thread safe
+ // Adds `num_ops` to the running total of finished operations.
+ void ReportFinishedOps(int64_t num_ops) {
+ total_ops_done_.fetch_add(num_ops);
+ }
+
+ private:
+ std::string Header() const { return "secs_elapsed,interval_qps"; }
+ // Body of the reporting thread: loops until stopped or until a write to
+ // the report file fails.
+ void SleepAndReport() {
+ auto time_started = env_->NowMicros();
+ while (true) {
+ {
+ std::unique_lock<std::mutex> lk(mutex_);
+ // wait_for returns the predicate's value: true means stop_ was set,
+ // false means the interval elapsed.
+ if (stop_ ||
+ stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
+ [&]() { return stop_; })) {
+ // stopping
+ break;
+ }
+ // else -> timeout, which means time for a report!
+ }
+ auto total_ops_done_snapshot = total_ops_done_.load();
+ // round the seconds elapsed
+ auto secs_elapsed =
+ (env_->NowMicros() - time_started + kMicrosInSecond / 2) /
+ kMicrosInSecond;
+ // The second column is the number of operations finished since the
+ // previous report; it is not divided by the interval length.
+ std::string report = ToString(secs_elapsed) + "," +
+ ToString(total_ops_done_snapshot - last_report_) +
+ "\n";
+ auto s = report_file_->Append(report);
+ if (s.ok()) {
+ s = report_file_->Flush();
+ }
+ if (!s.ok()) {
+ fprintf(stderr,
+ "Can't write to report file (%s), stopping the reporting\n",
+ s.ToString().c_str());
+ break;
+ }
+ last_report_ = total_ops_done_snapshot;
+ }
+ }
+
+ Env* env_;
+ std::unique_ptr<WritableFile> report_file_;
+ std::atomic<int64_t> total_ops_done_;
+ // Value of total_ops_done_ at the previous report; only touched by the
+ // reporting thread.
+ int64_t last_report_;
+ const uint64_t report_interval_secs_;
+ ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
+ // Protects stop_.
+ std::mutex mutex_;
+ // will notify on stop
+ std::condition_variable stop_cv_;
+ bool stop_;
+};
+
+// Kinds of operations for which latency histograms are kept separately.
+// The numeric values are spelled out; they match the implicit numbering.
+enum OperationType : unsigned char {
+  kRead = 0,
+  kWrite = 1,
+  kDelete = 2,
+  kSeek = 3,
+  kMerge = 4,
+  kUpdate = 5,
+  kCompress = 6,
+  kUncompress = 7,
+  kCrc = 8,
+  kHash = 9,
+  kOthers = 10
+};
+
+// Human-readable name of each OperationType, used when printing the
+// per-operation latency histograms.
+static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
+    OperationTypeString = {
+  {kRead, "read"},
+  {kWrite, "write"},
+  {kDelete, "delete"},
+  {kSeek, "seek"},
+  {kMerge, "merge"},
+  {kUpdate, "update"},
+  {kCompress, "compress"},
+  // This entry was previously keyed by kCompress a second time; duplicate
+  // keys in an initializer list are dropped, which left kUncompress unnamed.
+  {kUncompress, "uncompress"},
+  {kCrc, "crc"},
+  {kHash, "hash"},
+  {kOthers, "op"}
+};
+
+class CombinedStats;
+
+// Per-thread benchmark statistics: operation and byte counters, timing,
+// optional per-operation latency histograms, and periodic progress output.
+class Stats {
+ private:
+  int id_;
+  uint64_t start_;
+  uint64_t sine_interval_;
+  uint64_t finish_;
+  double seconds_;
+  uint64_t done_;
+  uint64_t last_report_done_;
+  uint64_t next_report_;
+  uint64_t bytes_;
+  uint64_t last_op_finish_;
+  uint64_t last_report_finish_;
+  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
+                     std::hash<unsigned char>> hist_;
+  std::string message_;
+  bool exclude_from_merge_;
+  ReporterAgent* reporter_agent_;  // does not own
+  friend class CombinedStats;
+
+ public:
+  // reporter_agent_ is initialised here rather than in Start(), because
+  // Start() must not discard an agent installed by SetReporterAgent().
+  // Without this, FinishedOps() would test an indeterminate pointer whenever
+  // no agent was installed.
+  Stats() : reporter_agent_(nullptr) { Start(-1); }
+
+  // Installs the (non-owned) agent that receives per-call op counts.
+  void SetReporterAgent(ReporterAgent* reporter_agent) {
+    reporter_agent_ = reporter_agent;
+  }
+
+  // Resets all counters, timestamps, histograms and the message for a new
+  // run identified by `id`. The reporter agent is left untouched.
+  void Start(int id) {
+    id_ = id;
+    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
+    // Take the start timestamp before anything derived from it. Previously
+    // last_op_finish_ was assigned from start_ before start_ had been set,
+    // i.e. from an indeterminate value on first use and from the previous
+    // run's start time afterwards.
+    start_ = FLAGS_env->NowMicros();
+    sine_interval_ = FLAGS_env->NowMicros();
+    last_op_finish_ = start_;
+    hist_.clear();
+    done_ = 0;
+    last_report_done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    finish_ = start_;
+    last_report_finish_ = start_;
+    message_.clear();
+    // When set, stats from this thread won't be merged with others.
+    exclude_from_merge_ = false;
+  }
+
+  // Folds `other` into this object unless `other` opted out of merging.
+  // Histograms present on both sides are merged; a histogram present only in
+  // `other` is shared by pointer, not copied.
+  void Merge(const Stats& other) {
+    if (other.exclude_from_merge_)
+      return;
+
+    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
+      auto this_it = hist_.find(it->first);
+      if (this_it != hist_.end()) {
+        this_it->second->Merge(*(other.hist_.at(it->first)));
+      } else {
+        hist_.insert({ it->first, it->second });
+      }
+    }
+
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  // Records the finish time and the elapsed seconds since Start().
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  // Appends `msg` to the message printed by Report().
+  void AddMessage(Slice msg) {
+    AppendWithSpace(&message_, msg);
+  }
+
+  void SetId(int id) { id_ = id; }
+  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
+
+  // Prints one line per thread returned by Env::GetThreadList() to stderr.
+  void PrintThreadStatus() {
+    std::vector<ThreadStatus> thread_list;
+    FLAGS_env->GetThreadList(&thread_list);
+
+    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
+        "ThreadID", "ThreadType", "cfName", "Operation",
+        "ElapsedTime", "Stage", "State", "OperationProperties");
+
+    int64_t current_time = 0;
+    FLAGS_env->GetCurrentTime(&current_time);
+    // Iterate by reference; each ThreadStatus was previously copied.
+    for (const auto& ts : thread_list) {
+      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
+          ts.thread_id,
+          ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
+          ts.cf_name.c_str(),
+          ThreadStatus::GetOperationName(ts.operation_type).c_str(),
+          ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
+          ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
+          ThreadStatus::GetStateName(ts.state_type).c_str());
+
+      auto op_properties = ThreadStatus::InterpretOperationProperties(
+          ts.operation_type, ts.op_properties);
+      for (const auto& op_prop : op_properties) {
+        fprintf(stderr, " %s %" PRIu64" |",
+            op_prop.first.c_str(), op_prop.second);
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  void ResetSineInterval() {
+    sine_interval_ = FLAGS_env->NowMicros();
+  }
+
+  uint64_t GetSineInterval() {
+    return sine_interval_;
+  }
+
+  uint64_t GetStart() {
+    return start_;
+  }
+
+  void ResetLastOpTime() {
+    // Set to now to avoid latency from calls to SleepForMicroseconds
+    last_op_finish_ = FLAGS_env->NowMicros();
+  }
+
+  // Accounts for `num_ops` finished operations of type `op_type`: forwards
+  // the count to the reporter agent (if any), records the latency since the
+  // previous call when histograms are enabled, and prints progress to stderr
+  // once the op count reaches the next reporting threshold.
+  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
+                   enum OperationType op_type = kOthers) {
+    if (reporter_agent_) {
+      reporter_agent_->ReportFinishedOps(num_ops);
+    }
+    if (FLAGS_histogram) {
+      uint64_t now = FLAGS_env->NowMicros();
+      uint64_t micros = now - last_op_finish_;
+
+      if (hist_.find(op_type) == hist_.end())
+      {
+        auto hist_temp = std::make_shared<HistogramImpl>();
+        hist_.insert({op_type, std::move(hist_temp)});
+      }
+      hist_[op_type]->Add(micros);
+
+      if (micros > 20000 && !FLAGS_stats_interval) {
+        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_ += num_ops;
+    if (done_ >= next_report_) {
+      if (!FLAGS_stats_interval) {
+        // No fixed interval configured: report at progressively wider steps.
+        if      (next_report_ < 1000)   next_report_ += 100;
+        else if (next_report_ < 5000)   next_report_ += 500;
+        else if (next_report_ < 10000)  next_report_ += 1000;
+        else if (next_report_ < 50000)  next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else                            next_report_ += 100000;
+        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
+      } else {
+        uint64_t now = FLAGS_env->NowMicros();
+        int64_t usecs_since_last = now - last_report_finish_;
+
+        // Determine whether to print status where interval is either
+        // each N operations or each N seconds.
+
+        if (FLAGS_stats_interval_seconds &&
+            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
+          // Don't check again for this many operations
+          next_report_ += FLAGS_stats_interval;
+
+        } else {
+
+          fprintf(stderr,
+                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
+                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
+                  FLAGS_env->TimeToString(now/1000000).c_str(),
+                  id_,
+                  done_ - last_report_done_, done_,
+                  (done_ - last_report_done_) /
+                  (usecs_since_last / 1000000.0),
+                  done_ / ((now - start_) / 1000000.0),
+                  (now - last_report_finish_) / 1000000.0,
+                  (now - start_) / 1000000.0);
+
+          if (id_ == 0 && FLAGS_stats_per_interval) {
+            std::string stats;
+
+            if (db_with_cfh && db_with_cfh->num_created.load()) {
+              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
+                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
+                                    &stats))
+                  fprintf(stderr, "%s\n", stats.c_str());
+                if (FLAGS_show_table_properties) {
+                  for (int level = 0; level < FLAGS_num_levels; ++level) {
+                    if (db->GetProperty(
+                            db_with_cfh->cfh[i],
+                            "rocksdb.aggregated-table-properties-at-level" +
+                                ToString(level),
+                            &stats)) {
+                      if (stats.find("# entries=0") == std::string::npos) {
+                        fprintf(stderr, "Level[%d]: %s\n", level,
+                                stats.c_str());
+                      }
+                    }
+                  }
+                }
+              }
+            } else if (db) {
+              if (db->GetProperty("rocksdb.stats", &stats)) {
+                fprintf(stderr, "%s\n", stats.c_str());
+              }
+              if (FLAGS_show_table_properties) {
+                for (int level = 0; level < FLAGS_num_levels; ++level) {
+                  if (db->GetProperty(
+                          "rocksdb.aggregated-table-properties-at-level" +
+                              ToString(level),
+                          &stats)) {
+                    if (stats.find("# entries=0") == std::string::npos) {
+                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
+                    }
+                  }
+                }
+              }
+            }
+          }
+
+          next_report_ += FLAGS_stats_interval;
+          last_report_finish_ = now;
+          last_report_done_ = done_;
+        }
+      }
+      if (id_ == 0 && FLAGS_thread_status_per_interval) {
+        PrintThreadStatus();
+      }
+      fflush(stderr);
+    }
+  }
+
+  void AddBytes(int64_t n) {
+    bytes_ += n;
+  }
+
+  // Prints the summary line for benchmark `name` to stdout, followed by the
+  // latency histograms and file-operation counters when the corresponding
+  // flags are set.
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedOps().
+    if (done_ < 1) done_ = 1;
+
+    // Rate is computed on actual elapsed time, not the sum of per-thread
+    // elapsed times.
+    const double elapsed = (finish_ - start_) * 1e-6;
+
+    std::string extra;
+    if (bytes_ > 0) {
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
+            name.ToString().c_str(),
+            seconds_ * 1e6 / done_,
+            (long)throughput,
+            (extra.empty() ? "" : " "),
+            extra.c_str());
+    if (FLAGS_histogram) {
+      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
+        fprintf(stdout, "Microseconds per %s:\n%s\n",
+                OperationTypeString[it->first].c_str(),
+                it->second->ToString().c_str());
+      }
+    }
+    if (FLAGS_report_file_operations) {
+      ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
+      ReportFileOpCounters* counters = env->counters();
+      fprintf(stdout, "Num files opened: %d\n",
+              counters->open_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num Read(): %d\n",
+              counters->read_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num Append(): %d\n",
+              counters->append_counter_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
+              counters->bytes_read_.load(std::memory_order_relaxed));
+      fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
+              counters->bytes_written_.load(std::memory_order_relaxed));
+      env->reset();
+    }
+    fflush(stdout);
+  }
+};
+
+// Collects the throughput of repeated runs of one benchmark and prints the
+// average and median across those runs.
+class CombinedStats {
+ public:
+  // Records the ops/sec of `stat` and, when it moved any bytes, its MB/sec.
+  // A run that reports no operations is counted as one operation.
+  void AddStats(const Stats& stat) {
+    const uint64_t ops = (stat.done_ < 1) ? 1 : stat.done_;
+    const uint64_t bytes = stat.bytes_;
+    const double elapsed = (stat.finish_ - stat.start_) * 1e-6;
+
+    throughput_ops_.emplace_back(ops / elapsed);
+
+    if (bytes > 0) {
+      const double mbs = (bytes / 1048576.0);
+      throughput_mbs_.emplace_back(mbs / elapsed);
+    }
+  }
+
+  // Prints the AVG and MEDIAN lines for `bench_name` to stdout. The MB/sec
+  // columns are included only if every recorded run had a byte rate.
+  void Report(const std::string& bench_name) {
+    const char* name = bench_name.c_str();
+    const int num_runs = static_cast<int>(throughput_ops_.size());
+    const int avg_ops = static_cast<int>(CalcAvg(throughput_ops_));
+    const int median_ops = static_cast<int>(CalcMedian(throughput_ops_));
+
+    if (throughput_mbs_.size() != throughput_ops_.size()) {
+      fprintf(stdout,
+              "%s [AVG    %d runs] : %d ops/sec\n"
+              "%s [MEDIAN %d runs] : %d ops/sec\n",
+              name, num_runs, avg_ops, name, num_runs, median_ops);
+      return;
+    }
+
+    const double avg_mbs = CalcAvg(throughput_mbs_);
+    const double median_mbs = CalcMedian(throughput_mbs_);
+    fprintf(stdout,
+            "%s [AVG    %d runs] : %d ops/sec; %6.1f MB/sec\n"
+            "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
+            name, num_runs, avg_ops, avg_mbs, name, num_runs, median_ops,
+            median_mbs);
+  }
+
+ private:
+  // Arithmetic mean of `data`.
+  double CalcAvg(const std::vector<double>& data) {
+    double sum = 0;
+    for (size_t i = 0; i < data.size(); ++i) {
+      sum += data[i];
+    }
+    return sum / data.size();
+  }
+
+  // Median of `data`; takes a copy because the values are sorted in place.
+  double CalcMedian(std::vector<double> data) {
+    assert(data.size() > 0);
+    std::sort(data.begin(), data.end());
+
+    const size_t mid = data.size() / 2;
+    const bool odd_count = (data.size() % 2 == 1);
+    // An even count has two middle elements; average them.
+    return odd_count ? data[mid] : (data[mid] + data[mid - 1]) / 2;
+  }
+
+  std::vector<double> throughput_ops_;
+  std::vector<double> throughput_mbs_;
+};
+
+// An atomic counter that starts at zero and stands in for a clock.
+class TimestampEmulator {
+ public:
+  TimestampEmulator() : timestamp_(0) {}
+
+  // Current counter value.
+  uint64_t Get() const { return timestamp_.load(); }
+
+  // Advances the counter by one.
+  void Inc() { timestamp_.fetch_add(1); }
+
+ private:
+  std::atomic<uint64_t> timestamp_;
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;  // bound to `mu` by the constructor
+  int total;
+  int perf_level;
+  std::shared_ptr<RateLimiter> write_rate_limiter;
+  std::shared_ptr<RateLimiter> read_rate_limiter;
+
+  // Each thread goes through the following states:
+  //    (1) initializing
+  //    (2) waiting for others to be initialized
+  //    (3) running
+  //    (4) done
+
+  long num_initialized;
+  long num_done;
+  bool start;
+
+  // The counters and the start flag are given defined initial values so that
+  // no member is read in an indeterminate state if a caller does not assign
+  // it first. Initializers follow the member declaration order.
+  SharedState()
+      : cv(&mu),
+        total(0),
+        perf_level(FLAGS_perf_level),
+        num_initialized(0),
+        num_done(0),
+        start(false) {}
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid;             // 0..n-1 when running in n threads
+  Random64 rand;       // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  // Seeds the generator with FLAGS_seed (or 1000 when that flag is zero)
+  // plus `index`. `shared` starts as nullptr instead of an indeterminate
+  // pointer; the caller assigns it afterwards.
+  /* implicit */ ThreadState(int index)
+      : tid(index),
+        rand((FLAGS_seed ? FLAGS_seed : 1000) + index),
+        shared(nullptr) {
+  }
+};
+
+// Decides when a benchmark loop should stop: after `max_seconds` of wall
+// time when that is non-zero, otherwise after `max_ops` operations.
+class Duration {
+ public:
+  // A non-positive `ops_per_stage` makes the whole run a single stage of
+  // `max_ops` operations.
+  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0)
+      : max_seconds_(max_seconds),
+        max_ops_(max_ops),
+        ops_per_stage_((ops_per_stage > 0) ? ops_per_stage : max_ops),
+        ops_(0),
+        start_at_(FLAGS_env->NowMicros()) {}
+
+  // Index of the stage the current op count falls into.
+  int64_t GetStage() {
+    const int64_t capped_ops = std::min(ops_, max_ops_ - 1);
+    return capped_ops / ops_per_stage_;
+  }
+
+  // Adds `increment` operations and returns true once the limit is reached.
+  bool Done(int64_t increment) {
+    if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (!max_seconds_) {
+      // Operation-count mode.
+      return ops_ > max_ops_;
+    }
+
+    // Time mode: the clock is only consulted when the op count crosses a
+    // multiple of the granularity (exact iff increment divides it).
+    auto granularity = FLAGS_ops_between_duration_checks;
+    if ((ops_ / granularity) == ((ops_ - increment) / granularity)) {
+      return false;
+    }
+    uint64_t now = FLAGS_env->NowMicros();
+    return ((now - start_at_) / 1000000) >= max_seconds_;
+  }
+
+ private:
+  uint64_t max_seconds_;
+  int64_t max_ops_;
+  int64_t ops_per_stage_;
+  int64_t ops_;
+  uint64_t start_at_;
+};
+
+class Benchmark {
+ private:
+ // Built by NewCache(FLAGS_cache_size) in the constructor; replaced by a
+ // SimCache wrapper there when FLAGS_simcache_size >= 0.
+ std::shared_ptr<Cache> cache_;
+ // Built by NewCache(FLAGS_compressed_cache_size) in the constructor.
+ std::shared_ptr<Cache> compressed_cache_;
+ // Bloom filter policy when FLAGS_bloom_bits >= 0, otherwise null.
+ std::shared_ptr<const FilterPolicy> filter_policy_;
+ // Created with NewFixedPrefixTransform(FLAGS_prefix_size); deleted in the
+ // destructor.
+ const SliceTransform* prefix_extractor_;
+ // DeleteDBs() is called on this in the destructor.
+ DBWithColumnFamilies db_;
+ std::vector<DBWithColumnFamilies> multi_dbs_;
+ // The members below mirror command-line flags. Several of them are
+ // re-assigned from the flags at the start of every benchmark in Run().
+ int64_t num_;
+ int key_size_;
+ int prefix_size_;
+ int64_t keys_per_prefix_;
+ int64_t entries_per_batch_;
+ int64_t writes_before_delete_range_;
+ int64_t writes_per_range_tombstone_;
+ int64_t range_tombstone_width_;
+ int64_t max_num_range_tombstones_;
+ WriteOptions write_options_;
+ Options open_options_; // keep options around to properly destroy db later
+#ifndef ROCKSDB_LITE
+ TraceOptions trace_options_;
+ TraceOptions block_cache_trace_options_;
+#endif
+ int64_t reads_;
+ int64_t deletes_;
+ double read_random_exp_range_;
+ int64_t writes_;
+ int64_t readwrites_;
+ int64_t merge_keys_;
+ bool report_file_operations_;
+ bool use_blob_db_;
+ // When non-empty, GenerateKeyFromInt() returns keys_[v] instead of
+ // synthesizing a key.
+ std::vector<std::string> keys_;
+
+ class ErrorHandlerListener : public EventListener {
+ public:
+#ifndef ROCKSDB_LITE
+ ErrorHandlerListener()
+ : mutex_(),
+ cv_(&mutex_),
+ no_auto_recovery_(false),
+ recovery_complete_(false) {}
+
+ ~ErrorHandlerListener() override {}
+
+ void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
+ Status /*bg_error*/,
+ bool* auto_recovery) override {
+ if (*auto_recovery && no_auto_recovery_) {
+ *auto_recovery = false;
+ }
+ }
+
+ void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ recovery_complete_ = true;
+ cv_.SignalAll();
+ }
+
+ bool WaitForRecovery(uint64_t abs_time_us) {
+ InstrumentedMutexLock l(&mutex_);
+ if (!recovery_complete_) {
+ cv_.TimedWait(abs_time_us);
+ }
+ if (recovery_complete_) {
+ recovery_complete_ = false;
+ return true;
+ }
+ return false;
+ }
+
+ void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+ private:
+ InstrumentedMutex mutex_;
+ InstrumentedCondVar cv_;
+ bool no_auto_recovery_;
+ bool recovery_complete_;
+#else // ROCKSDB_LITE
+ bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
+ void EnableAutoRecovery(bool /*enable*/) {}
+#endif // ROCKSDB_LITE
+ };
+
+ std::shared_ptr<ErrorHandlerListener> listener_;
+
+ // Validates flag values before a run. Returns false, after printing a
+ // message to stderr, when FLAGS_compression_ratio is greater than 1.
+ bool SanityCheck() {
+   const bool ratio_ok = !(FLAGS_compression_ratio > 1);
+   if (!ratio_ok) {
+     fprintf(stderr, "compression_ratio should be between 0 and 1\n");
+   }
+   return ratio_ok;
+ }
+
+ // Compresses `input` into `*compressed` with the codec selected by
+ // FLAGS_compression_type_e. Returns the codec's result, or false for a
+ // compression type that has no case below.
+ inline bool CompressSlice(const CompressionInfo& compression_info,
+                           const Slice& input, std::string* compressed) {
+   const char* src = input.data();
+   const size_t src_len = input.size();
+   switch (FLAGS_compression_type_e) {
+     case ROCKSDB_NAMESPACE::kSnappyCompression:
+       return Snappy_Compress(compression_info, src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kZlibCompression:
+       return Zlib_Compress(compression_info, 2, src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kBZip2Compression:
+       return BZip2_Compress(compression_info, 2, src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kLZ4Compression:
+       return LZ4_Compress(compression_info, 2, src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kLZ4HCCompression:
+       return LZ4HC_Compress(compression_info, 2, src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kXpressCompression:
+       // This codec takes no CompressionInfo argument.
+       return XPRESS_Compress(src, src_len, compressed);
+     case ROCKSDB_NAMESPACE::kZSTD:
+       return ZSTD_Compress(compression_info, src, src_len, compressed);
+     default:
+       return false;
+   }
+ }
+
+ // Prints the benchmark configuration banner: environment information, key
+ // and value sizes, entry counts, estimated data sizes, rate limits,
+ // compression and memtable settings, then any warnings. Exits the process
+ // if NUMA mode was requested but is unavailable.
+ void PrintHeader() {
+   PrintEnvironment();
+   fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
+   auto avg_value_size = FLAGS_value_size;
+   if (FLAGS_value_size_distribution_type_e == kFixed) {
+     fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
+             avg_value_size,
+             static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
+   } else {
+     // For a non-fixed distribution, report the midpoint of the size range.
+     avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
+     fprintf(stdout, "Values:     %d avg bytes each (%d bytes after compression)\n",
+             avg_value_size,
+             static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
+     fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
+             FLAGS_value_size_distribution_type.c_str(),
+             FLAGS_value_size_min, FLAGS_value_size_max);
+   }
+   // num_ and keys_per_prefix_ are int64_t, so they are printed with the
+   // signed PRId64 conversion (they were previously passed to PRIu64).
+   fprintf(stdout, "Entries:    %" PRId64 "\n", num_);
+   fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
+   fprintf(stdout, "Keys per prefix:    %" PRId64 "\n", keys_per_prefix_);
+   fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
+           ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_)
+            / 1048576.0));
+   fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
+           (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio)
+             * num_)
+            / 1048576.0));
+   fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
+           FLAGS_benchmark_write_rate_limit);
+   fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
+           FLAGS_benchmark_read_rate_limit);
+   if (FLAGS_enable_numa) {
+     fprintf(stderr, "Running in NUMA enabled mode.\n");
+#ifndef NUMA
+     fprintf(stderr, "NUMA is not defined in the system.\n");
+     exit(1);
+#else
+     if (numa_available() == -1) {
+       fprintf(stderr, "NUMA is not supported by the system.\n");
+       exit(1);
+     }
+#endif
+   }
+
+   auto compression = CompressionTypeToString(FLAGS_compression_type_e);
+   fprintf(stdout, "Compression: %s\n", compression.c_str());
+   fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
+           FLAGS_sample_for_compression);
+
+   switch (FLAGS_rep_factory) {
+     case kPrefixHash:
+       fprintf(stdout, "Memtablerep: prefix_hash\n");
+       break;
+     case kSkipList:
+       fprintf(stdout, "Memtablerep: skip_list\n");
+       break;
+     case kVectorRep:
+       fprintf(stdout, "Memtablerep: vector\n");
+       break;
+     case kHashLinkedList:
+       fprintf(stdout, "Memtablerep: hash_linkedlist\n");
+       break;
+   }
+   fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
+
+   PrintWarnings(compression.c_str());
+   fprintf(stdout, "------------------------------------------------\n");
+ }
+
+ // Prints build-configuration warnings to stdout and, when a compression
+ // type is selected, compresses one block-sized run of 'y' characters to
+ // warn if the codec is unavailable or does not shrink the data.
+ // `compression` is the codec name used in the messages.
+ void PrintWarnings(const char* compression) {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+   fprintf(stdout,
+           "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+           );
+#endif
+#ifndef NDEBUG
+   fprintf(stdout,
+           "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+   if (FLAGS_compression_type_e == ROCKSDB_NAMESPACE::kNoCompression) {
+     return;
+   }
+
+   // The test string should not be too small.
+   const int len = FLAGS_block_size;
+   std::string sample(len, 'y');
+   std::string compressed;
+   CompressionOptions opts;
+   CompressionContext context(FLAGS_compression_type_e);
+   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                        FLAGS_compression_type_e,
+                        FLAGS_sample_for_compression);
+   const bool compressed_ok = CompressSlice(info, Slice(sample), &compressed);
+
+   if (!compressed_ok) {
+     fprintf(stdout, "WARNING: %s compression is not enabled\n",
+             compression);
+   } else if (compressed.size() >= sample.size()) {
+     fprintf(stdout, "WARNING: %s compression is not effective\n",
+             compression);
+   }
+ }
+
+// Currently the following isn't equivalent to OS_LINUX.
+#if defined(__linux)
+ // Returns the sub-slice of `s` without leading and trailing whitespace.
+ // The result points into the same buffer as `s`.
+ static Slice TrimSpace(Slice s) {
+   // size_t indices avoid truncating s.size(); the unsigned char casts keep
+   // isspace() defined for bytes that are negative as plain char.
+   size_t start = 0;
+   while (start < s.size() &&
+          isspace(static_cast<unsigned char>(s[start]))) {
+     start++;
+   }
+   size_t limit = s.size();
+   while (limit > start &&
+          isspace(static_cast<unsigned char>(s[limit - 1]))) {
+     limit--;
+   }
+   return Slice(s.data() + start, limit - start);
+ }
+#endif
+
+ // Prints the RocksDB version to stderr and, on Linux, the current date
+ // plus the CPU count, CPU model and cache size read from /proc/cpuinfo.
+ void PrintEnvironment() {
+   fprintf(stderr, "RocksDB:    version %d.%d\n",
+           kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+   time_t now = time(nullptr);
+   char buf[52];
+   // Lint complains about ctime() usage, so replace it with ctime_r(). The
+   // requirement is to provide a buffer which is at least 26 bytes.
+   fprintf(stderr, "Date:       %s",
+           ctime_r(&now, buf));  // ctime_r() adds newline
+
+   FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+   if (cpuinfo != nullptr) {
+     char line[1000];
+     int num_cpus = 0;
+     std::string cpu_type;
+     std::string cache_size;
+     while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
+       const char* sep = strchr(line, ':');
+       // Skip lines without a separator, and lines that start with one:
+       // for those, `sep - 1 - line` below would be -1 and turn into a
+       // huge unsigned length.
+       if (sep == nullptr || sep == line) {
+         continue;
+       }
+       // The key excludes the character right before the ':'.
+       Slice key = TrimSpace(Slice(line, sep - 1 - line));
+       Slice val = TrimSpace(Slice(sep + 1));
+       if (key == "model name") {
+         ++num_cpus;
+         cpu_type = val.ToString();
+       } else if (key == "cache size") {
+         cache_size = val.ToString();
+       }
+     }
+     fclose(cpuinfo);
+     fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
+     fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
+   }
+#endif
+ }
+
+ // Reads the 8 bytes at offset 8 of `key` as a timestamp and returns true
+ // when it is more than FLAGS_time_range behind the emulator's current
+ // value.
+ // NOTE(review): assumes `key` holds at least 16 bytes -- confirm at the
+ // call sites.
+ static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
+                        const Slice& key) {
+   const char* ts_bytes = key.data() + 8;
+   uint64_t timestamp = 0;
+   if (!port::kLittleEndian) {
+     memcpy(&timestamp, ts_bytes, sizeof(timestamp));
+   } else {
+     // Assemble the value with the first byte as the most significant one.
+     for (int i = 0; i < 8; ++i) {
+       const uint64_t byte = static_cast<unsigned char>(ts_bytes[i]);
+       timestamp |= byte << (8 * (7 - i));
+     }
+   }
+   return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
+ }
+
+ // Compaction filter that drops every key KeyExpired() reports as expired
+ // relative to the shared timestamp emulator.
+ class ExpiredTimeFilter : public CompactionFilter {
+  public:
+   explicit ExpiredTimeFilter(
+       const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
+       : emulator_(timestamp_emulator) {}
+
+   const char* Name() const override { return "ExpiredTimeFilter"; }
+
+   // Returning true asks for the entry to be removed.
+   bool Filter(int /*level*/, const Slice& key,
+               const Slice& /*existing_value*/, std::string* /*new_value*/,
+               bool* /*value_changed*/) const override {
+     const TimestampEmulator* clock = emulator_.get();
+     return KeyExpired(clock, key);
+   }
+
+  private:
+   std::shared_ptr<TimestampEmulator> emulator_;
+ };
+
+ // Compaction filter whose Filter() always returns false, i.e. it never
+ // asks for an entry to be removed.
+ class KeepFilter : public CompactionFilter {
+  public:
+   const char* Name() const override { return "KeepFilter"; }
+
+   bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+               std::string* /*new_value*/,
+               bool* /*value_changed*/) const override {
+     const bool drop_entry = false;
+     return drop_entry;
+   }
+ };
+
+ // Creates the block cache selected by the flags. Returns nullptr for a
+ // non-positive `capacity`. Exits the process if a clock cache was
+ // requested but could not be created.
+ std::shared_ptr<Cache> NewCache(int64_t capacity) {
+   if (capacity <= 0) {
+     return nullptr;
+   }
+   const size_t cap = static_cast<size_t>(capacity);
+   if (!FLAGS_use_clock_cache) {
+     return NewLRUCache(cap, FLAGS_cache_numshardbits,
+                        false /*strict_capacity_limit*/,
+                        FLAGS_cache_high_pri_pool_ratio);
+   }
+   auto clock_cache = NewClockCache(cap, FLAGS_cache_numshardbits);
+   if (!clock_cache) {
+     fprintf(stderr, "Clock cache not supported.");
+     exit(1);
+   }
+   return clock_cache;
+ }
+
+ public:
+ // Builds the benchmark from command-line flags: creates the caches, filter
+ // policy and prefix extractor, validates a few flag combinations (exiting
+ // on error), removes leftover "heap-" files from the DB directory and,
+ // unless an existing DB is to be reused, destroys any DB found at FLAGS_db.
+ // Members not listed in the initializer list (e.g. deletes_) are assigned
+ // later, at the start of each benchmark in Run().
+ Benchmark()
+ : cache_(NewCache(FLAGS_cache_size)),
+ compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
+ filter_policy_(FLAGS_bloom_bits >= 0
+ ? NewBloomFilterPolicy(FLAGS_bloom_bits,
+ FLAGS_use_block_based_filter)
+ : nullptr),
+ prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
+ num_(FLAGS_num),
+ key_size_(FLAGS_key_size),
+ prefix_size_(FLAGS_prefix_size),
+ keys_per_prefix_(FLAGS_keys_per_prefix),
+ entries_per_batch_(1),
+ // A negative --reads / --writes / --merge_keys falls back to --num.
+ reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
+ read_random_exp_range_(0.0),
+ writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
+ // FLAGS_num when both --reads and --writes are negative, otherwise the
+ // larger of the two flags.
+ readwrites_(
+ (FLAGS_writes < 0 && FLAGS_reads < 0)
+ ? FLAGS_num
+ : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
+ merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
+ report_file_operations_(FLAGS_report_file_operations),
+#ifndef ROCKSDB_LITE
+ use_blob_db_(FLAGS_use_blob_db)
+#else
+ use_blob_db_(false)
+#endif // !ROCKSDB_LITE
+ {
+ // use simcache instead of cache
+ // Shard bits below 1 are passed to NewSimCache as 0.
+ if (FLAGS_simcache_size >= 0) {
+ if (FLAGS_cache_numshardbits >= 1) {
+ cache_ =
+ NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
+ } else {
+ cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
+ }
+ }
+
+ if (report_file_operations_) {
+ if (!FLAGS_hdfs.empty()) {
+ fprintf(stderr,
+ "--hdfs and --report_file_operations cannot be enabled "
+ "at the same time");
+ exit(1);
+ }
+ // Wrap the global Env in a ReportFileOpEnv.
+ // NOTE(review): no matching delete is visible in this part of the file.
+ FLAGS_env = new ReportFileOpEnv(FLAGS_env);
+ }
+
+ if (FLAGS_prefix_size > FLAGS_key_size) {
+ fprintf(stderr, "prefix size is larger than key size");
+ exit(1);
+ }
+
+ // Delete leftover files whose names start with "heap-". The statuses
+ // returned by GetChildren/DeleteFile are not checked.
+ std::vector<std::string> files;
+ FLAGS_env->GetChildren(FLAGS_db, &files);
+ for (size_t i = 0; i < files.size(); i++) {
+ if (Slice(files[i]).starts_with("heap-")) {
+ FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+ }
+ }
+ // Start from a clean slate unless asked to reuse the DB. The statuses of
+ // the destroy/create calls below are not checked.
+ if (!FLAGS_use_existing_db) {
+ Options options;
+ options.env = FLAGS_env;
+ if (!FLAGS_wal_dir.empty()) {
+ options.wal_dir = FLAGS_wal_dir;
+ }
+#ifndef ROCKSDB_LITE
+ if (use_blob_db_) {
+ blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
+ }
+#endif // !ROCKSDB_LITE
+ DestroyDB(FLAGS_db, options);
+ if (!FLAGS_wal_dir.empty()) {
+ FLAGS_env->DeleteDir(FLAGS_wal_dir);
+ }
+
+ // With several DBs, the directories are created up front.
+ if (FLAGS_num_multi_db > 1) {
+ FLAGS_env->CreateDir(FLAGS_db);
+ if (!FLAGS_wal_dir.empty()) {
+ FLAGS_env->CreateDir(FLAGS_wal_dir);
+ }
+ }
+ }
+
+ listener_.reset(new ErrorHandlerListener());
+ }
+
+ // Closes the primary DB, frees the prefix extractor and disowns the block
+ // cache's data.
+ ~Benchmark() {
+   db_.DeleteDBs();
+   delete prefix_extractor_;
+   if (cache_) {
+     // this will leak, but we're shutting down so nobody cares
+     cache_->DisownData();
+   }
+ }
+
+ // Allocates a key_size_-byte buffer, hands ownership to `*key_guard` and
+ // returns a Slice over it. The buffer contents are not initialised here.
+ Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
+   // Convert to const char* first so reset() receives the guard's exact
+   // pointer type.
+   const char* buffer = new char[key_size_];
+   key_guard->reset(buffer);
+   return Slice(key_guard->get(), key_size_);
+ }
+
+ // Generate key according to the given specification and random number.
+ // The resulting key will have the following format (if keys_per_prefix_
+ // is positive), extra trailing bytes are either cut off or padded with '0'.
+ // The prefix value is derived from key value.
+ //   ----------------------------
+ //   | prefix 00000 | key 00000 |
+ //   ----------------------------
+ // If keys_per_prefix_ is 0, the key is simply a binary representation of
+ // random number followed by trailing '0's
+ //   ----------------------------
+ //   |        key 00000         |
+ //   ----------------------------
+ //
+ // When keys_ is non-empty, `*key` is instead set to keys_[v]. Otherwise
+ // the key is written into the buffer `*key` already points at, which must
+ // hold key_size_ bytes.
+ void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
+   if (!keys_.empty()) {
+     assert(FLAGS_use_existing_keys);
+     assert(keys_.size() == static_cast<size_t>(num_keys));
+     assert(v < static_cast<uint64_t>(num_keys));
+     *key = keys_[v];
+     return;
+   }
+   char* start = const_cast<char*>(key->data());
+   char* pos = start;
+   if (keys_per_prefix_ > 0) {
+     int64_t num_prefix = num_keys / keys_per_prefix_;
+     // With fewer keys than keys_per_prefix_ the quotient is 0 and the
+     // modulo below would divide by zero; use a single prefix instead.
+     if (num_prefix < 1) {
+       num_prefix = 1;
+     }
+     int64_t prefix = v % num_prefix;
+     int bytes_to_fill = std::min(prefix_size_, 8);
+     if (port::kLittleEndian) {
+       // Write the most significant of the bytes_to_fill bytes first.
+       for (int i = 0; i < bytes_to_fill; ++i) {
+         pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+       }
+     } else {
+       memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
+     }
+     if (prefix_size_ > 8) {
+       // fill the rest with 0s
+       memset(pos + 8, '0', prefix_size_ - 8);
+     }
+     pos += prefix_size_;
+   }
+
+   int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
+   if (port::kLittleEndian) {
+     for (int i = 0; i < bytes_to_fill; ++i) {
+       pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+     }
+   } else {
+     memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
+   }
+   pos += bytes_to_fill;
+   // Pad whatever is left of the key with the character '0'.
+   if (key_size_ > pos - start) {
+     memset(pos, '0', key_size_ - (pos - start));
+   }
+ }
+
+ // Same as GenerateKeyFromInt(), except that with FLAGS_seek_missing_prefix
+ // set the byte at offset 8 of the key is overwritten with '1'.
+ void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
+   GenerateKeyFromInt(v, num_keys, key);
+   if (!FLAGS_seek_missing_prefix) {
+     return;
+   }
+   assert(prefix_size_ > 8);
+   // GenerateKeyFromInt pads the prefix with '0' characters, so a '1' in
+   // the padding yields a prefix that does not exist.
+   char* raw_key = const_cast<char*>(key->data());
+   raw_key[8] = '1';
+ }
+
+ // Returns `base_name` joined with `id`, inserting the platform's path
+ // separator when a non-empty `base_name` does not already end with one.
+ std::string GetPathForMultiple(std::string base_name, size_t id) {
+#ifndef OS_WIN
+   const char separator = '/';
+#else
+   const char separator = '\\';
+#endif
+   if (!base_name.empty() && base_name.back() != separator) {
+     base_name += separator;
+   }
+   return base_name + ToString(id);
+ }
+
+ // Opens the DB at `truth_db_name` read-only and checks that db_ holds the
+ // same data. Exits the process if the truth DB cannot be opened. All
+ // comparisons are done with assert(), so they are only performed in builds
+ // where assertions are enabled.
+ void VerifyDBFromDB(std::string& truth_db_name) {
+ DBWithColumnFamilies truth_db;
+ auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
+ if (!s.ok()) {
+ fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ // NOTE(review): truth_db.db is not explicitly closed in this function;
+ // whether DBWithColumnFamilies releases it is not visible here -- confirm.
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
+ std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
+ // Verify that all the key/values in truth_db are retrievable in db with
+ // ::Get
+ fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
+ for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
+ std::string value;
+ s = db_.db->Get(ro, truth_iter->key(), &value);
+ assert(s.ok());
+ // TODO(myabandeh): provide debugging hints
+ assert(Slice(value) == truth_iter->value());
+ }
+ // Verify that the db iterator does not give any extra key/value
+ // NOTE(review): this pass walks both iterators in lockstep and compares
+ // values only; keys are not compared here.
+ fprintf(stderr, "Verifying db == truth_db...\n");
+ for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
+ db_iter->Next(), truth_iter->Next()) {
+ assert(truth_iter->Valid());
+ assert(truth_iter->value() == db_iter->value());
+ }
+ // No more key should be left unchecked in truth_db
+ assert(!truth_iter->Valid());
+ fprintf(stderr, "...Verified\n");
+ }
+
+ // Runs every benchmark listed in the comma-separated --benchmarks flag, in
+ // the order given.
+ //
+ // For each entry this method:
+ //   1. resets the per-benchmark parameters (num_, reads_, write_options_,
+ //      ...) from the command-line flags,
+ //   2. strips an optional "[...]" suffix whose '-'-separated arguments are
+ //      "X<n>" (repeat the benchmark n times) and "W<n>" (warm up n times),
+ //   3. maps the name to a Benchmark member function, or runs a one-shot
+ //      action directly (compactall, stats, verify, ...),
+ //   4. destroys and reopens the database(s) when the benchmark needs a
+ //      fresh DB,
+ //   5. optionally starts tracing, then runs the warm-up and measured passes.
+ //
+ // After the loop it stops the secondary-update thread, ends any tracing and
+ // prints the requested statistics. Unknown benchmark names and invalid flag
+ // combinations terminate the process with exit(1).
+ void Run() {
+   if (!SanityCheck()) {
+     exit(1);
+   }
+   Open(&open_options_);
+   PrintHeader();
+   std::stringstream benchmark_stream(FLAGS_benchmarks);
+   std::string name;
+   std::unique_ptr<ExpiredTimeFilter> filter;
+   while (std::getline(benchmark_stream, name, ',')) {
+     // Sanitize parameters
+     num_ = FLAGS_num;
+     reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+     writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
+     deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
+     value_size = FLAGS_value_size;
+     key_size_ = FLAGS_key_size;
+     entries_per_batch_ = FLAGS_batch_size;
+     writes_before_delete_range_ = FLAGS_writes_before_delete_range;
+     writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
+     range_tombstone_width_ = FLAGS_range_tombstone_width;
+     max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
+     write_options_ = WriteOptions();
+     read_random_exp_range_ = FLAGS_read_random_exp_range;
+     if (FLAGS_sync) {
+       write_options_.sync = true;
+     }
+     write_options_.disableWAL = FLAGS_disable_wal;
+
+     // `method` stays null for one-shot actions that are executed directly
+     // inside the dispatch chain below.
+     void (Benchmark::*method)(ThreadState*) = nullptr;
+     void (Benchmark::*post_process_method)() = nullptr;
+
+     bool fresh_db = false;
+     int num_threads = FLAGS_threads;
+
+     int num_repeat = 1;
+     int num_warmup = 0;
+     // Parse an optional "name[X<n>-W<n>]" argument suffix.
+     if (!name.empty() && *name.rbegin() == ']') {
+       auto it = name.find('[');
+       if (it == std::string::npos) {
+         fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
+         exit(1);
+       }
+       std::string args = name.substr(it + 1);
+       args.resize(args.size() - 1);
+       name.resize(it);
+
+       std::string bench_arg;
+       std::stringstream args_stream(args);
+       while (std::getline(args_stream, bench_arg, '-')) {
+         if (bench_arg.empty()) {
+           continue;
+         }
+         if (bench_arg[0] == 'X') {
+           // Repeat the benchmark n times
+           std::string num_str = bench_arg.substr(1);
+           num_repeat = std::stoi(num_str);
+         } else if (bench_arg[0] == 'W') {
+           // Warm up the benchmark for n times
+           std::string num_str = bench_arg.substr(1);
+           num_warmup = std::stoi(num_str);
+         }
+       }
+     }
+
+     // Both fillseqdeterministic and filluniquerandomdeterministic
+     // fill the levels except the max level with UNIQUE_RANDOM
+     // and fill the max level with fillseq and filluniquerandom, respectively
+     if (name == "fillseqdeterministic" ||
+         name == "filluniquerandomdeterministic") {
+       if (!FLAGS_disable_auto_compactions) {
+         fprintf(stderr,
+                 "Please disable_auto_compactions in FillDeterministic "
+                 "benchmark\n");
+         exit(1);
+       }
+       if (num_threads > 1) {
+         fprintf(stderr,
+                 "filldeterministic multithreaded not supported"
+                 ", use 1 thread\n");
+         num_threads = 1;
+       }
+       fresh_db = true;
+       if (name == "fillseqdeterministic") {
+         method = &Benchmark::WriteSeqDeterministic;
+       } else {
+         method = &Benchmark::WriteUniqueRandomDeterministic;
+       }
+     } else if (name == "fillseq") {
+       fresh_db = true;
+       method = &Benchmark::WriteSeq;
+     } else if (name == "fillbatch") {
+       fresh_db = true;
+       entries_per_batch_ = 1000;
+       method = &Benchmark::WriteSeq;
+     } else if (name == "fillrandom") {
+       fresh_db = true;
+       method = &Benchmark::WriteRandom;
+     } else if (name == "filluniquerandom") {
+       fresh_db = true;
+       if (num_threads > 1) {
+         // Terminate the line so later diagnostics do not run into it.
+         fprintf(stderr,
+                 "filluniquerandom multithreaded not supported"
+                 ", use 1 thread\n");
+         num_threads = 1;
+       }
+       method = &Benchmark::WriteUniqueRandom;
+     } else if (name == "overwrite") {
+       method = &Benchmark::WriteRandom;
+     } else if (name == "fillsync") {
+       fresh_db = true;
+       num_ /= 1000;
+       write_options_.sync = true;
+       method = &Benchmark::WriteRandom;
+     } else if (name == "fill100K") {
+       fresh_db = true;
+       num_ /= 1000;
+       value_size = 100 * 1000;
+       method = &Benchmark::WriteRandom;
+     } else if (name == "readseq") {
+       method = &Benchmark::ReadSequential;
+     } else if (name == "readtorowcache") {
+       if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
+         fprintf(stderr,
+                 "Please set use_existing_keys to true and specify a "
+                 "row cache size in readtorowcache benchmark\n");
+         exit(1);
+       }
+       method = &Benchmark::ReadToRowCache;
+     } else if (name == "readtocache") {
+       method = &Benchmark::ReadSequential;
+       num_threads = 1;
+       reads_ = num_;
+     } else if (name == "readreverse") {
+       method = &Benchmark::ReadReverse;
+     } else if (name == "readrandom") {
+       if (FLAGS_multiread_stride) {
+         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                 entries_per_batch_);
+       }
+       method = &Benchmark::ReadRandom;
+     } else if (name == "readrandomfast") {
+       method = &Benchmark::ReadRandomFast;
+     } else if (name == "multireadrandom") {
+       fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+               entries_per_batch_);
+       method = &Benchmark::MultiReadRandom;
+     } else if (name == "mixgraph") {
+       method = &Benchmark::MixGraph;
+     } else if (name == "readmissing") {
+       ++key_size_;
+       method = &Benchmark::ReadRandom;
+     } else if (name == "newiterator") {
+       method = &Benchmark::IteratorCreation;
+     } else if (name == "newiteratorwhilewriting") {
+       num_threads++;  // Add extra thread for writing
+       method = &Benchmark::IteratorCreationWhileWriting;
+     } else if (name == "seekrandom") {
+       method = &Benchmark::SeekRandom;
+     } else if (name == "seekrandomwhilewriting") {
+       num_threads++;  // Add extra thread for writing
+       method = &Benchmark::SeekRandomWhileWriting;
+     } else if (name == "seekrandomwhilemerging") {
+       num_threads++;  // Add extra thread for merging
+       method = &Benchmark::SeekRandomWhileMerging;
+     } else if (name == "readrandomsmall") {
+       reads_ /= 1000;
+       method = &Benchmark::ReadRandom;
+     } else if (name == "deleteseq") {
+       method = &Benchmark::DeleteSeq;
+     } else if (name == "deleterandom") {
+       method = &Benchmark::DeleteRandom;
+     } else if (name == "readwhilewriting") {
+       num_threads++;  // Add extra thread for writing
+       method = &Benchmark::ReadWhileWriting;
+     } else if (name == "readwhilemerging") {
+       num_threads++;  // Add extra thread for writing
+       method = &Benchmark::ReadWhileMerging;
+     } else if (name == "readwhilescanning") {
+       num_threads++;  // Add extra thread for scanning
+       method = &Benchmark::ReadWhileScanning;
+     } else if (name == "readrandomwriterandom") {
+       method = &Benchmark::ReadRandomWriteRandom;
+     } else if (name == "readrandommergerandom") {
+       if (FLAGS_merge_operator.empty()) {
+         fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                 name.c_str());
+         exit(1);
+       }
+       method = &Benchmark::ReadRandomMergeRandom;
+     } else if (name == "updaterandom") {
+       method = &Benchmark::UpdateRandom;
+     } else if (name == "xorupdaterandom") {
+       method = &Benchmark::XORUpdateRandom;
+     } else if (name == "appendrandom") {
+       method = &Benchmark::AppendRandom;
+     } else if (name == "mergerandom") {
+       if (FLAGS_merge_operator.empty()) {
+         fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                 name.c_str());
+         exit(1);
+       }
+       method = &Benchmark::MergeRandom;
+     } else if (name == "randomwithverify") {
+       method = &Benchmark::RandomWithVerify;
+     } else if (name == "fillseekseq") {
+       method = &Benchmark::WriteSeqSeekSeq;
+     } else if (name == "compact") {
+       method = &Benchmark::Compact;
+     } else if (name == "compactall") {
+       CompactAll();
+     } else if (name == "crc32c") {
+       method = &Benchmark::Crc32c;
+     } else if (name == "xxhash") {
+       method = &Benchmark::xxHash;
+     } else if (name == "acquireload") {
+       method = &Benchmark::AcquireLoad;
+     } else if (name == "compress") {
+       method = &Benchmark::Compress;
+     } else if (name == "uncompress") {
+       method = &Benchmark::Uncompress;
+#ifndef ROCKSDB_LITE
+     } else if (name == "randomtransaction") {
+       method = &Benchmark::RandomTransaction;
+       post_process_method = &Benchmark::RandomTransactionVerify;
+#endif  // ROCKSDB_LITE
+     } else if (name == "randomreplacekeys") {
+       fresh_db = true;
+       method = &Benchmark::RandomReplaceKeys;
+     } else if (name == "timeseries") {
+       timestamp_emulator_.reset(new TimestampEmulator());
+       if (FLAGS_expire_style == "compaction_filter") {
+         // `filter` is declared outside the loop so the raw pointer stored
+         // in open_options_ stays valid for the rest of Run().
+         filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
+         fprintf(stdout, "Compaction filter is used to remove expired data\n");
+         open_options_.compaction_filter = filter.get();
+       }
+       fresh_db = true;
+       method = &Benchmark::TimeSeries;
+     } else if (name == "stats") {
+       PrintStats("rocksdb.stats");
+     } else if (name == "resetstats") {
+       ResetStats();
+     } else if (name == "verify") {
+       VerifyDBFromDB(FLAGS_truth_db);
+     } else if (name == "levelstats") {
+       PrintStats("rocksdb.levelstats");
+     } else if (name == "sstables") {
+       PrintStats("rocksdb.sstables");
+     } else if (name == "stats_history") {
+       PrintStatsHistory();
+     } else if (name == "replay") {
+       if (num_threads > 1) {
+         fprintf(stderr, "Multi-threaded replay is not yet supported\n");
+         exit(1);
+       }
+       if (FLAGS_trace_file == "") {
+         fprintf(stderr, "Please set --trace_file to be replayed from\n");
+         exit(1);
+       }
+       method = &Benchmark::Replay;
+     } else if (name == "getmergeoperands") {
+       method = &Benchmark::GetMergeOperands;
+     } else if (!name.empty()) {  // No error message for empty name
+       fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
+       exit(1);
+     }
+
+     if (fresh_db) {
+       if (FLAGS_use_existing_db) {
+         fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
+                 name.c_str());
+         method = nullptr;
+       } else {
+         // Tear down and destroy the single DB and every multi-DB instance
+         // before reopening below.
+         if (db_.db != nullptr) {
+           db_.DeleteDBs();
+           DestroyDB(FLAGS_db, open_options_);
+         }
+         Options options = open_options_;
+         for (size_t i = 0; i < multi_dbs_.size(); i++) {
+           delete multi_dbs_[i].db;
+           if (!open_options_.wal_dir.empty()) {
+             options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
+           }
+           DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
+         }
+         multi_dbs_.clear();
+       }
+       Open(&open_options_);  // use open_options for the last accessed
+     }
+
+     if (method != nullptr) {
+       fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+#ifndef ROCKSDB_LITE
+       // A trace_file option can be provided both for trace and replay
+       // operations. But db_bench does not support tracing and replaying at
+       // the same time, for now. So, start tracing only when it is not a
+       // replay.
+       if (FLAGS_trace_file != "" && name != "replay") {
+         std::unique_ptr<TraceWriter> trace_writer;
+         Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                       FLAGS_trace_file, &trace_writer);
+         if (!s.ok()) {
+           fprintf(stderr, "Encountered an error starting a trace, %s\n",
+                   s.ToString().c_str());
+           exit(1);
+         }
+         s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
+         if (!s.ok()) {
+           fprintf(stderr, "Encountered an error starting a trace, %s\n",
+                   s.ToString().c_str());
+           exit(1);
+         }
+         fprintf(stdout, "Tracing the workload to: [%s]\n",
+                 FLAGS_trace_file.c_str());
+       }
+       // Start block cache tracing.
+       if (!FLAGS_block_cache_trace_file.empty()) {
+         // Sanity checks.
+         if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
+           fprintf(stderr,
+                   "Block cache trace sampling frequency must be higher than "
+                   "0.\n");
+           exit(1);
+         }
+         if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
+           fprintf(stderr,
+                   "The maximum file size for block cache tracing must be "
+                   "higher than 0.\n");
+           exit(1);
+         }
+         block_cache_trace_options_.max_trace_file_size =
+             FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
+         block_cache_trace_options_.sampling_frequency =
+             FLAGS_block_cache_trace_sampling_frequency;
+         std::unique_ptr<TraceWriter> block_cache_trace_writer;
+         Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                       FLAGS_block_cache_trace_file,
+                                       &block_cache_trace_writer);
+         if (!s.ok()) {
+           fprintf(stderr,
+                   "Encountered an error when creating trace writer, %s\n",
+                   s.ToString().c_str());
+           exit(1);
+         }
+         s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
+                                          std::move(block_cache_trace_writer));
+         if (!s.ok()) {
+           fprintf(
+               stderr,
+               "Encountered an error when starting block cache tracing, %s\n",
+               s.ToString().c_str());
+           exit(1);
+         }
+         fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
+                 FLAGS_block_cache_trace_file.c_str());
+       }
+#endif  // ROCKSDB_LITE
+
+       if (num_warmup > 0) {
+         printf("Warming up benchmark by running %d times\n", num_warmup);
+       }
+
+       // Warm-up passes: their stats are discarded.
+       for (int i = 0; i < num_warmup; i++) {
+         RunBenchmark(num_threads, name, method);
+       }
+
+       if (num_repeat > 1) {
+         printf("Running benchmark for %d times\n", num_repeat);
+       }
+
+       CombinedStats combined_stats;
+       for (int i = 0; i < num_repeat; i++) {
+         Stats stats = RunBenchmark(num_threads, name, method);
+         combined_stats.AddStats(stats);
+       }
+       if (num_repeat > 1) {
+         combined_stats.Report(name);
+       }
+     }
+     if (post_process_method != nullptr) {
+       (this->*post_process_method)();
+     }
+   }
+
+   if (secondary_update_thread_) {
+     secondary_update_stopped_.store(1, std::memory_order_relaxed);
+     secondary_update_thread_->join();
+     secondary_update_thread_.reset();
+   }
+
+#ifndef ROCKSDB_LITE
+   // NOTE(review): at this point `name` holds whatever the final, failed
+   // std::getline call left in it rather than the last benchmark that ran --
+   // presumably an empty string, which would make the "replay" test below
+   // always true. Confirm before relying on it.
+   if (name != "replay" && FLAGS_trace_file != "") {
+     Status s = db_.db->EndTrace();
+     if (!s.ok()) {
+       fprintf(stderr, "Encountered an error ending the trace, %s\n",
+               s.ToString().c_str());
+     }
+   }
+   if (!FLAGS_block_cache_trace_file.empty()) {
+     Status s = db_.db->EndBlockCacheTrace();
+     if (!s.ok()) {
+       fprintf(stderr,
+               "Encountered an error ending the block cache tracing, %s\n",
+               s.ToString().c_str());
+     }
+   }
+#endif  // ROCKSDB_LITE
+
+   if (FLAGS_statistics) {
+     fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+   }
+   if (FLAGS_simcache_size >= 0) {
+     fprintf(stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
+             static_cast_with_check<SimCache, Cache>(cache_.get())
+                 ->ToString()
+                 .c_str());
+   }
+
+#ifndef ROCKSDB_LITE
+   if (FLAGS_use_secondary_db) {
+     fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
+             secondary_db_updates_);
+   }
+#endif  // ROCKSDB_LITE
+ }
+
+ private:
+ std::shared_ptr<TimestampEmulator> timestamp_emulator_;
+ std::unique_ptr<port::Thread> secondary_update_thread_;
+ std::atomic<int> secondary_update_stopped_{0};
+#ifndef ROCKSDB_LITE
+ uint64_t secondary_db_updates_ = 0;
+#endif // ROCKSDB_LITE
+ // Per-thread argument bundle. RunBenchmark fills one of these for each
+ // worker and passes its address to ThreadBody via StartThread.
+ struct ThreadArg {
+ Benchmark* bm;  // benchmark object on which `method` is invoked
+ SharedState* shared;  // start/finish rendezvous state shared by all workers
+ ThreadState* thread;  // this worker's own state (stats, tid)
+ void (Benchmark::*method)(ThreadState*);  // benchmark body to execute
+ };
+
+ // Worker-thread entry point. `v` is the ThreadArg prepared for this worker
+ // by RunBenchmark.
+ //
+ // The worker checks in under the shared mutex, blocks until the start flag
+ // is raised, runs the benchmark method while its stats timer is active, and
+ // finally reports completion under the same mutex.
+ static void ThreadBody(void* v) {
+   ThreadArg* const ctx = reinterpret_cast<ThreadArg*>(v);
+   SharedState* const state = ctx->shared;
+   ThreadState* const worker = ctx->thread;
+
+   // Phase 1: announce readiness, then wait for the start signal.
+   {
+     MutexLock guard(&state->mu);
+     state->num_initialized++;
+     if (state->num_initialized >= state->total) {
+       // Last worker to check in wakes whoever is waiting on the condvar.
+       state->cv.SignalAll();
+     }
+     while (!state->start) {
+       state->cv.Wait();
+     }
+   }
+
+   // Phase 2: run the benchmark body with perf instrumentation configured.
+   SetPerfLevel(static_cast<PerfLevel>(state->perf_level));
+   perf_context.EnablePerLevelPerfContext();
+   worker->stats.Start(worker->tid);
+   (ctx->bm->*(ctx->method))(worker);
+   worker->stats.Stop();
+
+   // Phase 3: report completion.
+   {
+     MutexLock guard(&state->mu);
+     state->num_done++;
+     if (state->num_done >= state->total) {
+       state->cv.SignalAll();
+     }
+   }
+ }
+
+ // Runs `method` concurrently on `n` worker threads and returns the merged
+ // per-thread statistics, which are also reported under `name`.
+ //
+ // Optional write/read rate limiters and a ReporterAgent are created when the
+ // corresponding flags are positive. All workers are released together once
+ // every one of them has checked in, and this call blocks until all of them
+ // have finished.
+ Stats RunBenchmark(int n, Slice name,
+                    void (Benchmark::*method)(ThreadState*)) {
+   SharedState shared;
+   shared.total = n;
+   shared.num_initialized = 0;
+   shared.num_done = 0;
+   shared.start = false;
+   if (FLAGS_benchmark_write_rate_limit > 0) {
+     shared.write_rate_limiter.reset(
+         NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+   }
+   if (FLAGS_benchmark_read_rate_limit > 0) {
+     shared.read_rate_limiter.reset(NewGenericRateLimiter(
+         FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
+         10 /* fairness */, RateLimiter::Mode::kReadsOnly));
+   }
+
+   std::unique_ptr<ReporterAgent> reporter_agent;
+   if (FLAGS_report_interval_seconds > 0) {
+     reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
+                                            FLAGS_report_interval_seconds));
+   }
+
+   ThreadArg* workers = new ThreadArg[n];
+
+   for (int tid = 0; tid < n; ++tid) {
+#ifdef NUMA
+     if (FLAGS_enable_numa) {
+       // Performs a local allocation of memory to threads in numa node.
+       int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
+       numa_exit_on_error = 1;
+       int numa_node = tid % n_nodes;
+       bitmask* nodes = numa_allocate_nodemask();
+       numa_bitmask_clearall(nodes);
+       numa_bitmask_setbit(nodes, numa_node);
+       // numa_bind() call binds the process to the node and these
+       // properties are passed on to the thread that is created in
+       // StartThread method called later in the loop.
+       numa_bind(nodes);
+       numa_set_strict(1);
+       numa_free_nodemask(nodes);
+     }
+#endif
+     ThreadArg& slot = workers[tid];
+     slot.bm = this;
+     slot.method = method;
+     slot.shared = &shared;
+     slot.thread = new ThreadState(tid);
+     slot.thread->stats.SetReporterAgent(reporter_agent.get());
+     slot.thread->shared = &shared;
+     FLAGS_env->StartThread(ThreadBody, &slot);
+   }
+
+   // Rendezvous with the workers: wait for all to check in, release them
+   // together, then wait for all to finish. The mutex is held for the whole
+   // sequence except while blocked in Wait().
+   {
+     MutexLock guard(&shared.mu);
+     while (shared.num_initialized < n) {
+       shared.cv.Wait();
+     }
+
+     shared.start = true;
+     shared.cv.SignalAll();
+     while (shared.num_done < n) {
+       shared.cv.Wait();
+     }
+   }
+
+   // Stats for some threads can be excluded.
+   Stats merge_stats;
+   for (int tid = 0; tid < n; ++tid) {
+     merge_stats.Merge(workers[tid].thread->stats);
+   }
+   merge_stats.Report(name);
+
+   // Release the per-thread state only after the report has been produced.
+   for (int tid = 0; tid < n; ++tid) {
+     delete workers[tid].thread;
+   }
+   delete[] workers;
+
+   return merge_stats;
+ }
+
+ // CRC32C micro-benchmark: repeatedly checksums a buffer of --block_size
+ // bytes until about 500MB have been processed, recording one op per call.
+ void Crc32c(ThreadState* thread) {
+   // use --block_size option for db_bench
+   const int block_len = FLAGS_block_size;
+   const std::string per_op_note =
+       "(" + ToString(FLAGS_block_size) + " per op)";
+   const std::string payload(block_len, 'x');
+
+   uint32_t checksum = 0;
+   int64_t processed = 0;
+   for (; processed < 500 * 1048576; processed += block_len) {
+     checksum = crc32c::Value(payload.data(), block_len);
+     thread->stats.FinishedOps(nullptr, nullptr, 1, kCrc);
+   }
+   // Print so result is not dead
+   fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(checksum));
+
+   thread->stats.AddBytes(processed);
+   thread->stats.AddMessage(per_op_note.c_str());
+ }
+
+ // XXH32 micro-benchmark: repeatedly hashes a fixed 4096-byte buffer (seed 0)
+ // until about 500MB have been processed, recording one op per call.
+ void xxHash(ThreadState* thread) {
+   const int block_len = 4096;
+   const std::string payload(block_len, 'x');
+
+   unsigned int digest = 0;
+   int64_t processed = 0;
+   for (; processed < 500 * 1048576; processed += block_len) {
+     digest = XXH32(payload.data(), block_len, 0);
+     thread->stats.FinishedOps(nullptr, nullptr, 1, kHash);
+   }
+   // Print so result is not dead
+   fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(digest));
+
+   thread->stats.AddBytes(processed);
+   thread->stats.AddMessage("(4K per op)");
+ }
+
+ // Micro-benchmark for std::atomic acquire loads: performs 100000 ops, each
+ // consisting of 1000 loads of the same atomic pointer.
+ void AcquireLoad(ThreadState* thread) {
+   int target;
+   std::atomic<void*> slot(&target);
+   void* loaded = nullptr;
+   thread->stats.AddMessage("(each op is 1000 loads)");
+   for (int op = 0; op < 100000; ++op) {
+     for (int k = 0; k < 1000; ++k) {
+       loaded = slot.load(std::memory_order_acquire);
+     }
+     thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
+   }
+   // Consume the loaded value; this disables the unused variable warning.
+   if (loaded == nullptr) {
+     exit(1);
+   }
+ }
+
+ // Compression micro-benchmark: compresses one generated block of
+ // --block_size bytes over and over with the configured compression type
+ // until 1GB of input has been consumed or a compression call fails. On
+ // success the output/input size ratio is attached to the stats message.
+ void Compress(ThreadState* thread) {
+   RandomGenerator gen;
+   Slice input = gen.Generate(FLAGS_block_size);
+   std::string compressed;
+   CompressionOptions opts;
+   CompressionContext context(FLAGS_compression_type_e);
+   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                        FLAGS_compression_type_e,
+                        FLAGS_sample_for_compression);
+
+   // Compress 1G
+   const int64_t target_bytes = int64_t(1) << 30;
+   int64_t consumed = 0;
+   int64_t emitted = 0;
+   bool ok = true;
+   while (ok && consumed < target_bytes) {
+     compressed.clear();
+     ok = CompressSlice(info, input, &compressed);
+     emitted += compressed.size();
+     consumed += input.size();
+     thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
+   }
+
+   if (ok) {
+     char buf[340];
+     snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+              (emitted * 100.0) / consumed);
+     thread->stats.AddMessage(buf);
+     thread->stats.AddBytes(consumed);
+   } else {
+     thread->stats.AddMessage("(compression failure)");
+   }
+ }
+
+ // Decompression micro-benchmark: compresses one generated block of
+ // --block_size bytes once, then decompresses it repeatedly with the
+ // configured compression type until 1GB worth of input has been accounted
+ // for or a step fails. Unsupported compression types count as a failure.
+ void Uncompress(ThreadState* thread) {
+   RandomGenerator gen;
+   Slice input = gen.Generate(FLAGS_block_size);
+   std::string compressed;
+
+   CompressionContext compression_ctx(FLAGS_compression_type_e);
+   CompressionOptions compression_opts;
+   CompressionInfo compression_info(
+       compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
+       FLAGS_compression_type_e, FLAGS_sample_for_compression);
+   UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
+   UncompressionInfo uncompression_info(uncompression_ctx,
+                                        UncompressionDict::GetEmptyDict(),
+                                        FLAGS_compression_type_e);
+
+   bool ok = CompressSlice(compression_info, input, &compressed);
+   const char* const src = compressed.data();
+   const size_t src_len = compressed.size();
+   int64_t processed = 0;
+   int decompress_size;
+   while (ok && processed < 1024 * 1048576) {
+     CacheAllocationPtr uncompressed;
+     switch (FLAGS_compression_type_e) {
+       case ROCKSDB_NAMESPACE::kSnappyCompression: {
+         // get size and allocate here to make comparison fair
+         size_t ulength = 0;
+         if (Snappy_GetUncompressedLength(src, src_len, &ulength)) {
+           uncompressed = AllocateBlock(ulength, nullptr);
+           ok = Snappy_Uncompress(src, src_len, uncompressed.get());
+         } else {
+           ok = false;
+         }
+         break;
+       }
+       case ROCKSDB_NAMESPACE::kZlibCompression:
+         uncompressed = Zlib_Uncompress(uncompression_info, src, src_len,
+                                        &decompress_size, 2);
+         ok = uncompressed.get() != nullptr;
+         break;
+       case ROCKSDB_NAMESPACE::kBZip2Compression:
+         uncompressed = BZip2_Uncompress(src, src_len, &decompress_size, 2);
+         ok = uncompressed.get() != nullptr;
+         break;
+       // LZ4 and LZ4HC go through the same decompression call.
+       case ROCKSDB_NAMESPACE::kLZ4Compression:
+       case ROCKSDB_NAMESPACE::kLZ4HCCompression:
+         uncompressed = LZ4_Uncompress(uncompression_info, src, src_len,
+                                       &decompress_size, 2);
+         ok = uncompressed.get() != nullptr;
+         break;
+       case ROCKSDB_NAMESPACE::kXpressCompression:
+         uncompressed.reset(
+             XPRESS_Uncompress(src, src_len, &decompress_size));
+         ok = uncompressed.get() != nullptr;
+         break;
+       case ROCKSDB_NAMESPACE::kZSTD:
+         uncompressed = ZSTD_Uncompress(uncompression_info, src, src_len,
+                                        &decompress_size);
+         ok = uncompressed.get() != nullptr;
+         break;
+       default:
+         ok = false;
+         break;
+     }
+     // The op is counted even when this iteration failed; the loop then
+     // exits on the next condition check.
+     processed += input.size();
+     thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
+   }
+
+   if (ok) {
+     thread->stats.AddBytes(processed);
+   } else {
+     thread->stats.AddMessage("(compression failure)");
+   }
+ }
+
+ // Populates *opts from the file named by --options_file.
+ //
+ // Returns true if *opts was initialized from that file. Returns false when
+ // no options file was specified (and always in ROCKSDB_LITE builds), in
+ // which case *opts is left untouched. If a file was specified but cannot be
+ // loaded, or yields no column family, an error is printed and the process
+ // exits with status 1.
+ bool InitializeOptionsFromFile(Options* opts) {
+#ifndef ROCKSDB_LITE
+   if (FLAGS_options_file != "") {
+     // Announce the file-based path only when a file is actually used, so
+     // the message does not appear ahead of a flag-based initialization.
+     printf("Initializing RocksDB Options from the specified file\n");
+     DBOptions db_opts;
+     std::vector<ColumnFamilyDescriptor> cf_descs;
+     auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
+                                  &cf_descs);
+     db_opts.env = FLAGS_env;
+     if (s.ok()) {
+       // Guard the cf_descs[0] access below.
+       if (cf_descs.empty()) {
+         fprintf(stderr,
+                 "Unable to load options file %s --- no column family "
+                 "options found\n",
+                 FLAGS_options_file.c_str());
+         exit(1);
+       }
+       // Only the first column family's options are used.
+       *opts = Options(db_opts, cf_descs[0].options);
+       return true;
+     }
+     fprintf(stderr, "Unable to load options file %s --- %s\n",
+             FLAGS_options_file.c_str(), s.ToString().c_str());
+     exit(1);
+   }
+#else
+   (void)opts;
+#endif
+   return false;
+ }
+
+ void InitializeOptionsFromFlags(Options* opts) {
+ printf("Initializing RocksDB Options from command-line flags\n");
+ Options& options = *opts;
+
+ assert(db_.db == nullptr);
+
+ options.env = FLAGS_env;
+ options.max_open_files = FLAGS_open_files;
+ if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
+ }
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+ options.max_write_buffer_number_to_maintain =
+ FLAGS_max_write_buffer_number_to_maintain;
+ options.max_write_buffer_size_to_maintain =
+ FLAGS_max_write_buffer_size_to_maintain;
+ options.max_background_jobs = FLAGS_max_background_jobs;
+ options.max_background_compactions = FLAGS_max_background_compactions;
+ options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
+ options.max_background_flushes = FLAGS_max_background_flushes;
+ options.compaction_style = FLAGS_compaction_style_e;
+ options.compaction_pri = FLAGS_compaction_pri_e;
+ options.allow_mmap_reads = FLAGS_mmap_read;
+ options.allow_mmap_writes = FLAGS_mmap_write;
+ options.use_direct_reads = FLAGS_use_direct_reads;
+ options.use_direct_io_for_flush_and_compaction =
+ FLAGS_use_direct_io_for_flush_and_compaction;
+#ifndef ROCKSDB_LITE
+ options.ttl = FLAGS_fifo_compaction_ttl;
+ options.compaction_options_fifo = CompactionOptionsFIFO(
+ FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
+ FLAGS_fifo_compaction_allow_compaction);
+#endif // ROCKSDB_LITE
+ if (FLAGS_prefix_size != 0) {
+ options.prefix_extractor.reset(
+ NewFixedPrefixTransform(FLAGS_prefix_size));
+ }
+ if (FLAGS_use_uint64_comparator) {
+ options.comparator = test::Uint64Comparator();
+ if (FLAGS_key_size != 8) {
+ fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
+ exit(1);
+ }
+ }
+ if (FLAGS_use_stderr_info_logger) {
+ options.info_log.reset(new StderrLogger());
+ }
+ options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
+ options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
+ options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
+ if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ NewCappedPrefixTransform(
+ FLAGS_memtable_insert_with_hint_prefix_size));
+ }
+ options.bloom_locality = FLAGS_bloom_locality;
+ options.max_file_opening_threads = FLAGS_file_opening_threads;
+ options.new_table_reader_for_compaction_inputs =
+ FLAGS_new_table_reader_for_compaction_inputs;
+ options.compaction_readahead_size = FLAGS_compaction_readahead_size;
+ options.log_readahead_size = FLAGS_log_readahead_size;
+ options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
+ options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
+ options.use_fsync = FLAGS_use_fsync;
+ options.num_levels = FLAGS_num_levels;
+ options.target_file_size_base = FLAGS_target_file_size_base;
+ options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+ options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+ options.level_compaction_dynamic_level_bytes =
+ FLAGS_level_compaction_dynamic_level_bytes;
+ options.max_bytes_for_level_multiplier =
+ FLAGS_max_bytes_for_level_multiplier;
+ if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
+ FLAGS_rep_factory == kHashLinkedList)) {
+ fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
+ "HashLinkedList memtablerep is used\n");
+ exit(1);
+ }
+ switch (FLAGS_rep_factory) {
+ case kSkipList:
+ options.memtable_factory.reset(new SkipListFactory(
+ FLAGS_skip_list_lookahead));
+ break;
+#ifndef ROCKSDB_LITE
+ case kPrefixHash:
+ options.memtable_factory.reset(
+ NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
+ break;
+ case kHashLinkedList:
+ options.memtable_factory.reset(NewHashLinkListRepFactory(
+ FLAGS_hash_bucket_count));
+ break;
+ case kVectorRep:
+ options.memtable_factory.reset(
+ new VectorRepFactory
+ );
+ break;
+#else
+ default:
+ fprintf(stderr, "Only skip list is supported in lite mode\n");
+ exit(1);
+#endif // ROCKSDB_LITE
+ }
+ if (FLAGS_use_plain_table) {
+#ifndef ROCKSDB_LITE
+ if (FLAGS_rep_factory != kPrefixHash &&
+ FLAGS_rep_factory != kHashLinkedList) {
+ fprintf(stderr, "Waring: plain table is used with skipList\n");
+ }
+
+ int bloom_bits_per_key = FLAGS_bloom_bits;
+ if (bloom_bits_per_key < 0) {
+ bloom_bits_per_key = 0;
+ }
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = FLAGS_key_size;
+ plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
+ plain_table_options.hash_table_ratio = 0.75;
+ options.table_factory = std::shared_ptr<TableFactory>(
+ NewPlainTableFactory(plain_table_options));
+#else
+ fprintf(stderr, "Plain table is not supported in lite mode\n");
+ exit(1);
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_use_cuckoo_table) {
+#ifndef ROCKSDB_LITE
+ if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
+ fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
+ exit(1);
+ }
+
+ if (!FLAGS_mmap_read) {
+ fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
+ exit(1);
+ }
+
+ ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
+ table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
+ table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
+ options.table_factory = std::shared_ptr<TableFactory>(
+ NewCuckooTableFactory(table_options));
+#else
+ fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+ exit(1);
+#endif // ROCKSDB_LITE
+ } else {
+ BlockBasedTableOptions block_based_options;
+ if (FLAGS_use_hash_search) {
+ if (FLAGS_prefix_size == 0) {
+ fprintf(stderr,
+ "prefix_size not assigned when enable use_hash_search \n");
+ exit(1);
+ }
+ block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
+ } else {
+ block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ }
+ if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
+ if (FLAGS_use_hash_search) {
+ fprintf(stderr,
+ "use_hash_search is incompatible with "
+ "partition index and is ignored");
+ }
+ block_based_options.index_type =
+ BlockBasedTableOptions::kTwoLevelIndexSearch;
+ block_based_options.metadata_block_size = FLAGS_metadata_block_size;
+ if (FLAGS_partition_index_and_filters) {
+ block_based_options.partition_filters = true;
+ }
+ }
+ if (cache_ == nullptr) {
+ block_based_options.no_block_cache = true;
+ }
+ block_based_options.cache_index_and_filter_blocks =
+ FLAGS_cache_index_and_filter_blocks;
+ block_based_options.pin_l0_filter_and_index_blocks_in_cache =
+ FLAGS_pin_l0_filter_and_index_blocks_in_cache;
+ block_based_options.pin_top_level_index_and_filter =
+ FLAGS_pin_top_level_index_and_filter;
+ if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps
+ block_based_options.cache_index_and_filter_blocks_with_high_priority =
+ true;
+ }
+ block_based_options.block_cache = cache_;
+ block_based_options.block_cache_compressed = compressed_cache_;
+ block_based_options.block_size = FLAGS_block_size;
+ block_based_options.block_restart_interval = FLAGS_block_restart_interval;
+ block_based_options.index_block_restart_interval =
+ FLAGS_index_block_restart_interval;
+ block_based_options.filter_policy = filter_policy_;
+ block_based_options.format_version =
+ static_cast<uint32_t>(FLAGS_format_version);
+ block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
+ block_based_options.enable_index_compression =
+ FLAGS_enable_index_compression;
+ block_based_options.block_align = FLAGS_block_align;
+ if (FLAGS_use_data_block_hash_index) {
+ block_based_options.data_block_index_type =
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
+ } else {
+ block_based_options.data_block_index_type =
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
+ }
+ block_based_options.data_block_hash_table_util_ratio =
+ FLAGS_data_block_hash_table_util_ratio;
+ if (FLAGS_read_cache_path != "") {
+#ifndef ROCKSDB_LITE
+ Status rc_status;
+
+ // The read cache needs to be provided with a Logger; we will put all
+ // read cache logs in the read cache path in a file named rc_LOG
+ rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
+ std::shared_ptr<Logger> read_cache_logger;
+ if (rc_status.ok()) {
+ rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
+ &read_cache_logger);
+ }
+
+ if (rc_status.ok()) {
+ PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
+ FLAGS_read_cache_size,
+ read_cache_logger);
+
+ rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
+ rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
+ rc_cfg.writer_qdepth = 4;
+ rc_cfg.writer_dispatch_size = 4 * 1024;
+
+ auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
+ block_based_options.persistent_cache = pcache;
+ rc_status = pcache->Open();
+ }
+
+ if (!rc_status.ok()) {
+ fprintf(stderr, "Error initializing read cache, %s\n",
+ rc_status.ToString().c_str());
+ exit(1);
+ }
+#else
+ fprintf(stderr, "Read cache is not supported in LITE\n");
+ exit(1);
+
+#endif
+ }
+ options.table_factory.reset(
+ NewBlockBasedTableFactory(block_based_options));
+ }
+ if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
+ if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
+ static_cast<unsigned int>(FLAGS_num_levels)) {
+ fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+ static_cast<int>(
+ FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
+ exit(1);
+ }
+ options.max_bytes_for_level_multiplier_additional =
+ FLAGS_max_bytes_for_level_multiplier_additional_v;
+ }
+ options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+ options.level0_file_num_compaction_trigger =
+ FLAGS_level0_file_num_compaction_trigger;
+ options.level0_slowdown_writes_trigger =
+ FLAGS_level0_slowdown_writes_trigger;
+ options.compression = FLAGS_compression_type_e;
+ options.sample_for_compression = FLAGS_sample_for_compression;
+ options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+ options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+ options.max_total_wal_size = FLAGS_max_total_wal_size;
+
+ if (FLAGS_min_level_to_compress >= 0) {
+ assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
+ options.compression_per_level.resize(FLAGS_num_levels);
+ for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = FLAGS_min_level_to_compress;
+ i < FLAGS_num_levels; i++) {
+ options.compression_per_level[i] = FLAGS_compression_type_e;
+ }
+ }
+ options.soft_rate_limit = FLAGS_soft_rate_limit;
+ options.hard_rate_limit = FLAGS_hard_rate_limit;
+ options.soft_pending_compaction_bytes_limit =
+ FLAGS_soft_pending_compaction_bytes_limit;
+ options.hard_pending_compaction_bytes_limit =
+ FLAGS_hard_pending_compaction_bytes_limit;
+ options.delayed_write_rate = FLAGS_delayed_write_rate;
+ options.allow_concurrent_memtable_write =
+ FLAGS_allow_concurrent_memtable_write;
+ options.inplace_update_support = FLAGS_inplace_update_support;
+ options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
+ options.enable_write_thread_adaptive_yield =
+ FLAGS_enable_write_thread_adaptive_yield;
+ options.enable_pipelined_write = FLAGS_enable_pipelined_write;
+ options.unordered_write = FLAGS_unordered_write;
+ options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
+ options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
+ options.rate_limit_delay_max_milliseconds =
+ FLAGS_rate_limit_delay_max_milliseconds;
+ options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
+ options.max_compaction_bytes = FLAGS_max_compaction_bytes;
+ options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+ options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
+
+ // fill storage options
+ options.advise_random_on_open = FLAGS_advise_random_on_open;
+ options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
+ options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
+ options.bytes_per_sync = FLAGS_bytes_per_sync;
+ options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
+
+ // merge operator options
+ options.merge_operator = MergeOperators::CreateFromStringId(
+ FLAGS_merge_operator);
+ if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
+ fprintf(stderr, "invalid merge operator: %s\n",
+ FLAGS_merge_operator.c_str());
+ exit(1);
+ }
+ options.max_successive_merges = FLAGS_max_successive_merges;
+ options.report_bg_io_stats = FLAGS_report_bg_io_stats;
+
+ // set universal style compaction configurations, if applicable
+ if (FLAGS_universal_size_ratio != 0) {
+ options.compaction_options_universal.size_ratio =
+ FLAGS_universal_size_ratio;
+ }
+ if (FLAGS_universal_min_merge_width != 0) {
+ options.compaction_options_universal.min_merge_width =
+ FLAGS_universal_min_merge_width;
+ }
+ if (FLAGS_universal_max_merge_width != 0) {
+ options.compaction_options_universal.max_merge_width =
+ FLAGS_universal_max_merge_width;
+ }
+ if (FLAGS_universal_max_size_amplification_percent != 0) {
+ options.compaction_options_universal.max_size_amplification_percent =
+ FLAGS_universal_max_size_amplification_percent;
+ }
+ if (FLAGS_universal_compression_size_percent != -1) {
+ options.compaction_options_universal.compression_size_percent =
+ FLAGS_universal_compression_size_percent;
+ }
+ options.compaction_options_universal.allow_trivial_move =
+ FLAGS_universal_allow_trivial_move;
+ if (FLAGS_thread_status_per_interval > 0) {
+ options.enable_thread_tracking = true;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (FLAGS_readonly && FLAGS_transaction_db) {
+ fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
+ exit(1);
+ }
+ if (FLAGS_use_secondary_db &&
+ (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
+ fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
+ exit(1);
+ }
+#endif // ROCKSDB_LITE
+
+ }
+
+ // Applies the flag-driven settings that are set no matter where the rest of
+ // the options came from (Open() calls this after either initializer), then
+ // opens the database(s) and, when requested, preloads the existing keys.
+ //
+ // opts: in/out options object; it is modified in place and the modified
+ //       value is what gets handed to OpenDb().
+ //
+ // Exits the process if the rate-limiter flags are inconsistent (and OpenDb()
+ // itself exits on an open failure).
+ void InitializeOptionsGeneral(Options* opts) {
+ Options& options = *opts;
+
+ options.create_missing_column_families = FLAGS_num_column_families > 1;
+ options.statistics = dbstats;
+ options.wal_dir = FLAGS_wal_dir;
+ options.create_if_missing = !FLAGS_use_existing_db;
+ options.dump_malloc_stats = FLAGS_dump_malloc_stats;
+ options.stats_dump_period_sec =
+ static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
+ options.stats_persist_period_sec =
+ static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
+ options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
+ options.stats_history_buffer_size =
+ static_cast<size_t>(FLAGS_stats_history_buffer_size);
+
+ options.compression_opts.level = FLAGS_compression_level;
+ options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
+ options.compression_opts.zstd_max_train_bytes =
+ FLAGS_compression_zstd_max_train_bytes;
+ // If this is a block based table, set some related options
+ if (options.table_factory->Name() == BlockBasedTableFactory::kName &&
+ options.table_factory->GetOptions() != nullptr) {
+ // The factory's option struct is edited in place, so the block cache and
+ // filter policy set here apply to the factory already stored in `options`.
+ BlockBasedTableOptions* table_options =
+ reinterpret_cast<BlockBasedTableOptions*>(
+ options.table_factory->GetOptions());
+ if (FLAGS_cache_size) {
+ table_options->block_cache = cache_;
+ }
+ if (FLAGS_bloom_bits >= 0) {
+ table_options->filter_policy.reset(NewBloomFilterPolicy(
+ FLAGS_bloom_bits, FLAGS_use_block_based_filter));
+ }
+ }
+ if (FLAGS_row_cache_size) {
+ // Only pass an explicit shard-bit count when the flag holds a usable value.
+ if (FLAGS_cache_numshardbits >= 1) {
+ options.row_cache =
+ NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
+ } else {
+ options.row_cache = NewLRUCache(FLAGS_row_cache_size);
+ }
+ }
+ if (FLAGS_enable_io_prio) {
+ FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
+ FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
+ }
+ if (FLAGS_enable_cpu_prio) {
+ FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
+ FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
+ }
+ options.env = FLAGS_env;
+ if (FLAGS_sine_write_rate) {
+ // Seed the write rate limit with the sine curve's value at x = 0.
+ FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
+ }
+
+ if (FLAGS_rate_limiter_bytes_per_sec > 0) {
+ if (FLAGS_rate_limit_bg_reads &&
+ !FLAGS_new_table_reader_for_compaction_inputs) {
+ fprintf(stderr,
+ "rate limit compaction reads must have "
+ "new_table_reader_for_compaction_inputs set\n");
+ exit(1);
+ }
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ FLAGS_rate_limiter_bytes_per_sec, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */,
+ FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
+ : RateLimiter::Mode::kWritesOnly,
+ FLAGS_rate_limiter_auto_tuned));
+ }
+
+ // NOTE(review): a listener entry is appended on every call; if this function
+ // runs more than once on the same Options object the list grows each time.
+ options.listeners.emplace_back(listener_);
+ if (FLAGS_num_multi_db <= 1) {
+ OpenDb(options, FLAGS_db, &db_);
+ } else {
+ multi_dbs_.clear();
+ multi_dbs_.resize(FLAGS_num_multi_db);
+ // Each database gets its own path (and WAL directory, when one is set);
+ // the caller-visible wal_dir is restored once all databases are open.
+ auto wal_dir = options.wal_dir;
+ for (int i = 0; i < FLAGS_num_multi_db; i++) {
+ if (!wal_dir.empty()) {
+ options.wal_dir = GetPathForMultiple(wal_dir, i);
+ }
+ OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
+ }
+ options.wal_dir = wal_dir;
+ }
+
+ // KeepFilter is a noop filter, this can be used to test compaction filter
+ // NOTE(review): this runs after the OpenDb() calls above, and OpenDb() takes
+ // its Options by value, so the databases opened by this call were opened
+ // without the filter. The raw `new` is also repeated on every call. Confirm
+ // whether both are intended.
+ if (FLAGS_use_keep_filter) {
+ options.compaction_filter = new KeepFilter();
+ fprintf(stdout, "A noop compaction filter is used\n");
+ }
+
+ if (FLAGS_use_existing_keys) {
+ // Only work on single database
+ assert(db_.db != nullptr);
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ // Copy every key currently in the database into keys_ and make FLAGS_num
+ // match the number of keys found. The iterator's final status is not
+ // checked here.
+ Iterator* iter = db_.db->NewIterator(read_opts);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_.emplace_back(iter->key().ToString());
+ }
+ delete iter;
+ FLAGS_num = keys_.size();
+ }
+ }
+
+ // Builds *opts and opens the database(s).
+ //
+ // The options file is tried first; the command-line flags are used to fill
+ // *opts only when InitializeOptionsFromFile() reports that it did not do so.
+ // The general, always-applied settings and the actual open happen in
+ // InitializeOptionsGeneral().
+ void Open(Options* opts) {
+   const bool loaded_from_file = InitializeOptionsFromFile(opts);
+   if (!loaded_from_file) {
+     InitializeOptionsFromFlags(opts);
+   }
+   InitializeOptionsGeneral(opts);
+ }
+
+ // Opens the database at db_name into *db, picking the open API from the
+ // flags (read-only, optimistic transaction, transaction, blob, secondary or
+ // plain), with or without explicit column families.
+ //
+ // options: taken by value, so the tweaks made below (two_write_queues,
+ //          info_log) do not leak back to the caller.
+ // db_name: path of the database to open.
+ // db:      receives the DB pointer and, in the multi-column-family case, the
+ //          column family handles and hot-column-family bookkeeping.
+ //
+ // Prints the error and exits the process if the open fails or if
+ // --column_family_distribution is malformed.
+ void OpenDb(Options options, const std::string& db_name,
+ DBWithColumnFamilies* db) {
+ Status s;
+ // Open with column families if necessary.
+ if (FLAGS_num_column_families > 1) {
+ // num_hot is the number of column families opened right now. It is
+ // FLAGS_num_hot_column_families when that is a positive value smaller than
+ // the total; otherwise all column families are hot and the flag is
+ // rewritten to say so.
+ size_t num_hot = FLAGS_num_column_families;
+ if (FLAGS_num_hot_column_families > 0 &&
+ FLAGS_num_hot_column_families < FLAGS_num_column_families) {
+ num_hot = FLAGS_num_hot_column_families;
+ } else {
+ FLAGS_num_hot_column_families = FLAGS_num_column_families;
+ }
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < num_hot; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ ColumnFamilyName(i), ColumnFamilyOptions(options)));
+ }
+ // Optional comma-separated integer weights, one per hot column family,
+ // which must add up to exactly 100.
+ std::vector<int> cfh_idx_to_prob;
+ if (!FLAGS_column_family_distribution.empty()) {
+ std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
+ std::string cf_prob;
+ int sum = 0;
+ while (std::getline(cf_prob_stream, cf_prob, ',')) {
+ cfh_idx_to_prob.push_back(std::stoi(cf_prob));
+ sum += cfh_idx_to_prob.back();
+ }
+ if (sum != 100) {
+ fprintf(stderr, "column_family_distribution items must sum to 100\n");
+ exit(1);
+ }
+ if (cfh_idx_to_prob.size() != num_hot) {
+ fprintf(stderr,
+ "got %" ROCKSDB_PRIszt
+ " column_family_distribution items; expected "
+ "%" ROCKSDB_PRIszt "\n",
+ cfh_idx_to_prob.size(), num_hot);
+ exit(1);
+ }
+ }
+ // NOTE(review): the multi-column-family path below does not consult
+ // FLAGS_use_blob_db or FLAGS_use_secondary_db; only the single-column-family
+ // chain further down does.
+#ifndef ROCKSDB_LITE
+ if (FLAGS_readonly) {
+ s = DB::OpenForReadOnly(options, db_name, column_families,
+ &db->cfh, &db->db);
+ } else if (FLAGS_optimistic_transaction_db) {
+ s = OptimisticTransactionDB::Open(options, db_name, column_families,
+ &db->cfh, &db->opt_txn_db);
+ if (s.ok()) {
+ db->db = db->opt_txn_db->GetBaseDB();
+ }
+ } else if (FLAGS_transaction_db) {
+ // `ptr` is only read after a successful Open(), which assigns it.
+ TransactionDB* ptr;
+ TransactionDBOptions txn_db_options;
+ // unordered_write is paired here with two write queues, the WRITE_PREPARED
+ // policy and concurrency control skipped.
+ if (options.unordered_write) {
+ options.two_write_queues = true;
+ txn_db_options.skip_concurrency_control = true;
+ txn_db_options.write_policy = WRITE_PREPARED;
+ }
+ s = TransactionDB::Open(options, txn_db_options, db_name,
+ column_families, &db->cfh, &ptr);
+ if (s.ok()) {
+ db->db = ptr;
+ }
+ } else {
+ s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+ }
+#else
+ s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+#endif // ROCKSDB_LITE
+ // The handle vector is sized for all column families even though only
+ // num_hot were opened; presumably the rest are filled in later by
+ // CreateNewCf() (see DoWrite) - TODO confirm.
+ db->cfh.resize(FLAGS_num_column_families);
+ db->num_created = num_hot;
+ db->num_hot = num_hot;
+ db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
+#ifndef ROCKSDB_LITE
+ } else if (FLAGS_readonly) {
+ s = DB::OpenForReadOnly(options, db_name, &db->db);
+ } else if (FLAGS_optimistic_transaction_db) {
+ s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
+ if (s.ok()) {
+ db->db = db->opt_txn_db->GetBaseDB();
+ }
+ } else if (FLAGS_transaction_db) {
+ TransactionDB* ptr = nullptr;
+ TransactionDBOptions txn_db_options;
+ if (options.unordered_write) {
+ options.two_write_queues = true;
+ txn_db_options.skip_concurrency_control = true;
+ txn_db_options.write_policy = WRITE_PREPARED;
+ }
+ // Unlike the multi-column-family transaction path above, the info logger is
+ // created explicitly before the transaction DB is opened.
+ s = CreateLoggerFromOptions(db_name, options, &options.info_log);
+ if (s.ok()) {
+ s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
+ }
+ if (s.ok()) {
+ db->db = ptr;
+ }
+ } else if (FLAGS_use_blob_db) {
+ blob_db::BlobDBOptions blob_db_options;
+ blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
+ blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
+ blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
+ blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
+ blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
+ blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
+ blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
+ blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
+ blob_db_options.compression = FLAGS_blob_db_compression_type_e;
+ blob_db::BlobDB* ptr = nullptr;
+ s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
+ if (s.ok()) {
+ db->db = ptr;
+ }
+ } else if (FLAGS_use_secondary_db) {
+ // Default the secondary path to "<test directory>/dbbench_secondary" when
+ // none was given; the flag itself is updated with the chosen path.
+ if (FLAGS_secondary_path.empty()) {
+ std::string default_secondary_path;
+ FLAGS_env->GetTestDirectory(&default_secondary_path);
+ default_secondary_path += "/dbbench_secondary";
+ FLAGS_secondary_path = default_secondary_path;
+ }
+ s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
+ if (s.ok() && FLAGS_secondary_update_interval > 0) {
+ // Background thread: call TryCatchUpWithPrimary(), then sleep for
+ // `interval` seconds, until secondary_update_stopped_ becomes non-zero or
+ // a catch-up attempt fails. Each successful catch-up bumps
+ // secondary_db_updates_.
+ secondary_update_thread_.reset(new port::Thread(
+ [this](int interval, DBWithColumnFamilies* _db) {
+ while (0 == secondary_update_stopped_.load(
+ std::memory_order_relaxed)) {
+ Status secondary_update_status =
+ _db->db->TryCatchUpWithPrimary();
+ if (!secondary_update_status.ok()) {
+ fprintf(stderr, "Failed to catch up with primary: %s\n",
+ secondary_update_status.ToString().c_str());
+ break;
+ }
+ ++secondary_db_updates_;
+ FLAGS_env->SleepForMicroseconds(interval * 1000000);
+ }
+ },
+ FLAGS_secondary_update_interval, db));
+ }
+#endif // ROCKSDB_LITE
+ } else {
+ s = DB::Open(options, db_name, &db->db);
+ }
+ if (!s.ok()) {
+ fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ }
+
+ // Order in which the write benchmarks generate key indices; interpreted by
+ // KeyGenerator::Next().
+ enum WriteMode {
+   RANDOM,        // rand->Next() reduced modulo the key count
+   SEQUENTIAL,    // 0, 1, 2, ...
+   UNIQUE_RANDOM  // a shuffled sequence that yields each index once
+ };
+
+ // Runs DoDeterministicCompact() with sequential key order, using the
+ // compaction style the database was opened with. The returned Status is
+ // not inspected here.
+ void WriteSeqDeterministic(ThreadState* thread) {
+   const CompactionStyle style = open_options_.compaction_style;
+   DoDeterministicCompact(thread, style, SEQUENTIAL);
+ }
+
+ // Runs DoDeterministicCompact() with unique-random key order, using the
+ // compaction style the database was opened with. The returned Status is
+ // not inspected here.
+ void WriteUniqueRandomDeterministic(ThreadState* thread) {
+   const CompactionStyle style = open_options_.compaction_style;
+   DoDeterministicCompact(thread, style, UNIQUE_RANDOM);
+ }
+
+ // Write benchmark entry point: keys are generated in sequential order.
+ void WriteSeq(ThreadState* thread) {
+   const WriteMode mode = SEQUENTIAL;
+   DoWrite(thread, mode);
+ }
+
+ // Write benchmark entry point: keys are generated in random order (indices
+ // may repeat).
+ void WriteRandom(ThreadState* thread) {
+   const WriteMode mode = RANDOM;
+   DoWrite(thread, mode);
+ }
+
+ // Write benchmark entry point: keys are generated in a shuffled order in
+ // which each index appears once.
+ void WriteUniqueRandom(ThreadState* thread) {
+   const WriteMode mode = UNIQUE_RANDOM;
+   DoWrite(thread, mode);
+ }
+
+ // Produces the stream of key indices consumed by the write benchmarks.
+ //   SEQUENTIAL    - 0, 1, 2, ... (no wrap-around is applied here)
+ //   RANDOM        - rand->Next() reduced modulo num
+ //   UNIQUE_RANDOM - a permutation of [0, num) seeded from FLAGS_seed; at most
+ //                   num values may be requested
+ // The fourth constructor argument is accepted for call-site compatibility
+ // but is unused.
+ class KeyGenerator {
+  public:
+   KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
+                uint64_t /*num_per_set*/ = 64 * 1024)
+       : rand_(rand), mode_(mode), num_(num), next_(0) {
+     if (mode_ != UNIQUE_RANDOM) {
+       // Only the unique-random mode needs the precomputed permutation.
+       return;
+     }
+     // NOTE: if memory consumption of this approach becomes a concern,
+     // we can either break it into pieces and only random shuffle a section
+     // each time. Alternatively, use a bit map implementation
+     // (https://reviews.facebook.net/differential/diff/54627/)
+     values_.resize(num_);
+     uint64_t index = 0;
+     for (auto& slot : values_) {
+       slot = index++;
+     }
+     std::default_random_engine engine(static_cast<unsigned int>(FLAGS_seed));
+     std::shuffle(values_.begin(), values_.end(), engine);
+   }
+
+   // Returns the next key index according to the mode chosen at construction.
+   uint64_t Next() {
+     if (mode_ == SEQUENTIAL) {
+       return next_++;
+     }
+     if (mode_ == RANDOM) {
+       return rand_->Next() % num_;
+     }
+     if (mode_ == UNIQUE_RANDOM) {
+       assert(next_ < num_);
+       return values_[next_++];
+     }
+     // Unknown mode: unreachable for valid WriteMode values.
+     assert(false);
+     return std::numeric_limits<uint64_t>::max();
+   }
+
+  private:
+   Random64* rand_;                // source of randomness for RANDOM mode
+   WriteMode mode_;                // generation strategy
+   const uint64_t num_;            // size of the key index space
+   uint64_t next_;                 // next position / next sequential value
+   std::vector<uint64_t> values_;  // permutation, filled for UNIQUE_RANDOM only
+ };
+
+ // Returns the DB pointer of the database chosen for this thread by
+ // SelectDBWithCfh().
+ DB* SelectDB(ThreadState* thread) {
+   DBWithColumnFamilies* chosen = SelectDBWithCfh(thread);
+   return chosen->db;
+ }
+
+ // Chooses a database using the next value from the thread's own random
+ // number generator.
+ DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
+   const uint64_t pick = thread->rand.Next();
+   return SelectDBWithCfh(pick);
+ }
+
+ // Returns the single database when one is open; otherwise picks the entry of
+ // multi_dbs_ at index rand_int modulo the number of databases.
+ DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
+   if (db_.db == nullptr) {
+     return &multi_dbs_[rand_int % multi_dbs_.size()];
+   }
+   return &db_;
+ }
+
+ // Evaluates the configured sine curve a * sin(b * x + c) + d at x, where
+ // a, b, c and d come from the FLAGS_sine_* flags.
+ double SineRate(double x) {
+   const double phase = (FLAGS_sine_b * x) + FLAGS_sine_c;
+   return FLAGS_sine_a * sin(phase) + FLAGS_sine_d;
+ }
+
+ // Core write loop shared by the fill/overwrite benchmarks.
+ //
+ // Generates keys in the order given by write_mode, groups them into batches
+ // of entries_per_batch_ and writes each batch to a randomly chosen database
+ // (or to the single open database). Optionally interleaves range tombstones,
+ // throttles through the shared write rate limiter and re-tunes that limiter
+ // along a sine curve.
+ //
+ // thread:     per-thread state (random generator, stats, shared limiters).
+ // write_mode: key order; only RANDOM honours FLAGS_duration.
+ //
+ // Exits the process if a write fails and recovery does not succeed.
+ void DoWrite(ThreadState* thread, WriteMode write_mode) {
+ // A time limit applies to random writes only; the other modes run for a
+ // fixed number of operations.
+ const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
+ const int64_t num_ops = writes_ == 0 ? num_ : writes_;
+
+ // One key generator per database so every database sees the full sequence.
+ size_t num_key_gens = 1;
+ if (db_.db == nullptr) {
+ num_key_gens = multi_dbs_.size();
+ }
+ std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
+ int64_t max_ops = num_ops * num_key_gens;
+ int64_t ops_per_stage = max_ops;
+ // With hot column families the run is split into stages; the stage count is
+ // the integer quotient of total to hot column families, and ops_per_stage is
+ // max_ops divided by that count, rounded up.
+ if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
+ ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
+ FLAGS_num_hot_column_families) +
+ 1;
+ }
+
+ Duration duration(test_duration, max_ops, ops_per_stage);
+ for (size_t i = 0; i < num_key_gens; i++) {
+ // The key space is widened by max_num_range_tombstones_ because each range
+ // tombstone also draws a value from the generator (see below).
+ key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
+ num_ + max_num_range_tombstones_,
+ ops_per_stage));
+ }
+
+ if (num_ != FLAGS_num) {
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
+ thread->stats.AddMessage(msg);
+ }
+
+ RandomGenerator gen;
+ WriteBatch batch;
+ Status s;
+ int64_t bytes = 0;
+
+ // Key buffers are allocated once and rewritten in place for every entry.
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ std::unique_ptr<const char[]> begin_key_guard;
+ Slice begin_key = AllocateKey(&begin_key_guard);
+ std::unique_ptr<const char[]> end_key_guard;
+ Slice end_key = AllocateKey(&end_key_guard);
+ std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
+ std::vector<Slice> expanded_keys;
+ if (FLAGS_expand_range_tombstones) {
+ expanded_key_guards.resize(range_tombstone_width_);
+ for (auto& expanded_key_guard : expanded_key_guards) {
+ expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
+ }
+ }
+
+ int64_t stage = 0;
+ int64_t num_written = 0;
+ while (!duration.Done(entries_per_batch_)) {
+ // On a stage change, create the column families for the new stage in every
+ // open database.
+ if (duration.GetStage() != stage) {
+ stage = duration.GetStage();
+ if (db_.db != nullptr) {
+ db_.CreateNewCf(open_options_, stage);
+ } else {
+ for (auto& db : multi_dbs_) {
+ db.CreateNewCf(open_options_, stage);
+ }
+ }
+ }
+
+ // `id` selects both the key generator and (in multi-db mode) the database.
+ size_t id = thread->rand.Next() % num_key_gens;
+ DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
+ batch.Clear();
+ int64_t batch_bytes = 0;
+
+ for (int64_t j = 0; j < entries_per_batch_; j++) {
+ int64_t rand_num = key_gens[id]->Next();
+ GenerateKeyFromInt(rand_num, FLAGS_num, &key);
+ Slice val = gen.Generate();
+ // NOTE(review): in blob-db mode each entry is written immediately and `s`
+ // is overwritten by the next entry; only the status left after the whole
+ // batch is checked further down.
+ if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+ blob_db::BlobDB* blobdb =
+ static_cast<blob_db::BlobDB*>(db_with_cfh->db);
+ if (FLAGS_blob_db_max_ttl_range > 0) {
+ int ttl = rand() % FLAGS_blob_db_max_ttl_range;
+ s = blobdb->PutWithTTL(write_options_, key, val, ttl);
+ } else {
+ s = blobdb->Put(write_options_, key, val);
+ }
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_num_column_families <= 1) {
+ batch.Put(key, val);
+ } else {
+ // We use same rand_num as seed for key and column family so that we
+ // can deterministically find the cfh corresponding to a particular
+ // key while reading the key.
+ batch.Put(db_with_cfh->GetCfh(rand_num), key,
+ val);
+ }
+ batch_bytes += val.size() + key_size_;
+ bytes += val.size() + key_size_;
+ ++num_written;
+ // Emit a range tombstone after every writes_per_range_tombstone_ writes,
+ // starting once writes_before_delete_range_ writes have been made, and
+ // stopping once the count implied by max_num_range_tombstones_ is reached.
+ if (writes_per_range_tombstone_ > 0 &&
+ num_written > writes_before_delete_range_ &&
+ (num_written - writes_before_delete_range_) /
+ writes_per_range_tombstone_ <=
+ max_num_range_tombstones_ &&
+ (num_written - writes_before_delete_range_) %
+ writes_per_range_tombstone_ ==
+ 0) {
+ int64_t begin_num = key_gens[id]->Next();
+ if (FLAGS_expand_range_tombstones) {
+ // Expanded form: one point delete per key in the tombstone's range.
+ for (int64_t offset = 0; offset < range_tombstone_width_;
+ ++offset) {
+ GenerateKeyFromInt(begin_num + offset, FLAGS_num,
+ &expanded_keys[offset]);
+ if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+ s = db_with_cfh->db->Delete(write_options_,
+ expanded_keys[offset]);
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_num_column_families <= 1) {
+ batch.Delete(expanded_keys[offset]);
+ } else {
+ batch.Delete(db_with_cfh->GetCfh(rand_num),
+ expanded_keys[offset]);
+ }
+ }
+ } else {
+ // Compact form: a single DeleteRange from begin_num to
+ // begin_num + range_tombstone_width_.
+ GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
+ GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
+ &end_key);
+ if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+ s = db_with_cfh->db->DeleteRange(
+ write_options_, db_with_cfh->db->DefaultColumnFamily(),
+ begin_key, end_key);
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_num_column_families <= 1) {
+ batch.DeleteRange(begin_key, end_key);
+ } else {
+ batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
+ end_key);
+ }
+ }
+ }
+ }
+ if (thread->shared->write_rate_limiter.get() != nullptr) {
+ thread->shared->write_rate_limiter->Request(
+ batch_bytes, Env::IO_HIGH,
+ nullptr /* stats */, RateLimiter::OpType::kWrite);
+ // Set time at which last op finished to Now() to hide latency and
+ // sleep from rate limiter. Also, do the check once per batch, not
+ // once per write.
+ thread->stats.ResetLastOpTime();
+ }
+ // Blob-db entries were already written one by one above.
+ if (!use_blob_db_) {
+ s = db_with_cfh->db->Write(write_options_, &batch);
+ }
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
+ entries_per_batch_, kWrite);
+ if (FLAGS_sine_write_rate) {
+ uint64_t now = FLAGS_env->NowMicros();
+
+ // Guard against `now` being earlier than the recorded interval start.
+ uint64_t usecs_since_last;
+ if (now > thread->stats.GetSineInterval()) {
+ usecs_since_last = now - thread->stats.GetSineInterval();
+ } else {
+ usecs_since_last = 0;
+ }
+
+ // Once per configured interval, replace the shared write rate limiter
+ // with one whose rate is the sine curve evaluated at the elapsed seconds.
+ if (usecs_since_last >
+ (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
+ double usecs_since_start =
+ static_cast<double>(now - thread->stats.GetStart());
+ thread->stats.ResetSineInterval();
+ uint64_t write_rate =
+ static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
+ thread->shared->write_rate_limiter.reset(
+ NewGenericRateLimiter(write_rate));
+ }
+ }
+ // A failed write is forgiven if the listener reports that recovery
+ // completed (timeout value presumably in microseconds - TODO confirm).
+ if (!s.ok()) {
+ s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
+ }
+
+ if (!s.ok()) {
+ fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ }
+ thread->stats.AddBytes(bytes);
+ }
+
+ // Fills the database(s) into a reproducible LSM shape ("filldeterministic").
+ //
+ // Automatic compactions are disabled, data is written with DoWrite() in
+ // rounds of shrinking size, each round's new level-0 files are recorded as
+ // one "sorted run", and the runs are then placed with manual compactions.
+ // In debug builds the resulting layout is verified, then the per-level file
+ // layout is printed and the overridden options are restored.
+ //
+ // thread:           per-thread benchmark state, forwarded to DoWrite().
+ // compaction_style: level, universal or FIFO; any other style is rejected.
+ // write_mode:       key order for the first round; later rounds always use
+ //                   UNIQUE_RANDOM.
+ //
+ // Returns OK on success, InvalidArgument for unsupported configurations and
+ // NotSupported in ROCKSDB_LITE builds. Exits the process if too few sorted
+ // runs were produced to fill the levels. Modifies writes_ as a side effect.
+ //
+ // NOTE(review): the InvalidArgument early returns below happen after
+ // SetOptions() has already overridden the options, and they skip the restore
+ // at the end of the function.
+ Status DoDeterministicCompact(ThreadState* thread,
+ CompactionStyle compaction_style,
+ WriteMode write_mode) {
+#ifndef ROCKSDB_LITE
+ ColumnFamilyMetaData meta;
+ std::vector<DB*> db_list;
+ if (db_.db != nullptr) {
+ db_list.push_back(db_.db);
+ } else {
+ for (auto& db : multi_dbs_) {
+ db_list.push_back(db.db);
+ }
+ }
+ // Remember each database's current options so they can be restored at the
+ // end, then switch off automatic compactions (and, except for FIFO, raise
+ // the level-0 slowdown/stop triggers to a very large value).
+ std::vector<Options> options_list;
+ for (auto db : db_list) {
+ options_list.push_back(db->GetOptions());
+ if (compaction_style != kCompactionStyleFIFO) {
+ db->SetOptions({{"disable_auto_compactions", "1"},
+ {"level0_slowdown_writes_trigger", "400000000"},
+ {"level0_stop_writes_trigger", "400000000"}});
+ } else {
+ db->SetOptions({{"disable_auto_compactions", "1"}});
+ }
+ }
+
+ assert(!db_list.empty());
+ auto num_db = db_list.size();
+ size_t num_levels = static_cast<size_t>(open_options_.num_levels);
+ // NOTE(review): computed before num_levels is validated; with num_levels == 0
+ // this unsigned subtraction wraps around.
+ size_t output_level = open_options_.num_levels - 1;
+ // sorted_runs[db][run] holds the metadata of the files that make up one run.
+ std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
+ std::vector<size_t> num_files_at_level0(num_db, 0);
+ if (compaction_style == kCompactionStyleLevel) {
+ // NOTE(review): the check rejects 0 only, while the message says "larger
+ // than 1" - confirm which is intended.
+ if (num_levels == 0) {
+ return Status::InvalidArgument("num_levels should be larger than 1");
+ }
+ bool should_stop = false;
+ while (!should_stop) {
+ if (sorted_runs[0].empty()) {
+ DoWrite(thread, write_mode);
+ } else {
+ DoWrite(thread, UNIQUE_RANDOM);
+ }
+ for (size_t i = 0; i < num_db; i++) {
+ auto db = db_list[i];
+ db->Flush(FlushOptions());
+ db->GetColumnFamilyMetaData(&meta);
+ // Stop when the round produced no new level-0 file or the write budget
+ // has shrunk to zero.
+ if (num_files_at_level0[i] == meta.levels[0].files.size() ||
+ writes_ == 0) {
+ should_stop = true;
+ continue;
+ }
+ // The new run is the leading part of the level-0 file list, i.e. the
+ // list minus as many trailing entries as were present before the round.
+ sorted_runs[i].emplace_back(
+ meta.levels[0].files.begin(),
+ meta.levels[0].files.end() - num_files_at_level0[i]);
+ num_files_at_level0[i] = meta.levels[0].files.size();
+ if (sorted_runs[i].back().size() == 1) {
+ should_stop = true;
+ continue;
+ }
+ // Once there are as many runs as output_level, drop the first third of
+ // the last run's files and stop.
+ if (sorted_runs[i].size() == output_level) {
+ auto& L1 = sorted_runs[i].back();
+ L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
+ should_stop = true;
+ continue;
+ }
+ }
+ // Each following round writes fewer keys, by the level size multiplier.
+ writes_ /= static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
+ }
+ for (size_t i = 0; i < num_db; i++) {
+ if (sorted_runs[i].size() < num_levels - 1) {
+ fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", num_levels);
+ exit(1);
+ }
+ }
+ // Place run j at level (output_level - j): the first run goes to the
+ // bottommost level. Only the first and last file names of each run are
+ // passed to CompactFiles().
+ for (size_t i = 0; i < num_db; i++) {
+ auto db = db_list[i];
+ auto compactionOptions = CompactionOptions();
+ compactionOptions.compression = FLAGS_compression_type_e;
+ auto options = db->GetOptions();
+ MutableCFOptions mutable_cf_options(options);
+ for (size_t j = 0; j < sorted_runs[i].size(); j++) {
+ compactionOptions.output_file_size_limit =
+ MaxFileSizeForLevel(mutable_cf_options,
+ static_cast<int>(output_level), compaction_style);
+ // NOTE(review): looks like leftover debug output (the universal branch
+ // below has no equivalent).
+ std::cout << sorted_runs[i][j].size() << std::endl;
+ db->CompactFiles(compactionOptions, {sorted_runs[i][j].back().name,
+ sorted_runs[i][j].front().name},
+ static_cast<int>(output_level - j) /*level*/);
+ }
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ auto ratio = open_options_.compaction_options_universal.size_ratio;
+ bool should_stop = false;
+ while (!should_stop) {
+ if (sorted_runs[0].empty()) {
+ DoWrite(thread, write_mode);
+ } else {
+ DoWrite(thread, UNIQUE_RANDOM);
+ }
+ for (size_t i = 0; i < num_db; i++) {
+ auto db = db_list[i];
+ db->Flush(FlushOptions());
+ db->GetColumnFamilyMetaData(&meta);
+ if (num_files_at_level0[i] == meta.levels[0].files.size() ||
+ writes_ == 0) {
+ should_stop = true;
+ continue;
+ }
+ sorted_runs[i].emplace_back(
+ meta.levels[0].files.begin(),
+ meta.levels[0].files.end() - num_files_at_level0[i]);
+ num_files_at_level0[i] = meta.levels[0].files.size();
+ if (sorted_runs[i].back().size() == 1) {
+ should_stop = true;
+ continue;
+ }
+ // NOTE(review): repeats the assignment made a few lines above.
+ num_files_at_level0[i] = meta.levels[0].files.size();
+ }
+ // Shrink the next round to 100 / (ratio + 200) of the current one.
+ writes_ = static_cast<int64_t>(writes_* static_cast<double>(100) / (ratio + 200));
+ }
+ for (size_t i = 0; i < num_db; i++) {
+ if (sorted_runs[i].size() < num_levels) {
+ fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", num_levels);
+ exit(1);
+ }
+ }
+ // Same placement as the level style, except that runs beyond the number of
+ // levels are compacted to level 0.
+ for (size_t i = 0; i < num_db; i++) {
+ auto db = db_list[i];
+ auto compactionOptions = CompactionOptions();
+ compactionOptions.compression = FLAGS_compression_type_e;
+ auto options = db->GetOptions();
+ MutableCFOptions mutable_cf_options(options);
+ for (size_t j = 0; j < sorted_runs[i].size(); j++) {
+ compactionOptions.output_file_size_limit =
+ MaxFileSizeForLevel(mutable_cf_options,
+ static_cast<int>(output_level), compaction_style);
+ db->CompactFiles(
+ compactionOptions,
+ {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
+ (output_level > j ? static_cast<int>(output_level - j)
+ : 0) /*level*/);
+ }
+ }
+ } else if (compaction_style == kCompactionStyleFIFO) {
+ if (num_levels != 1) {
+ return Status::InvalidArgument(
+ "num_levels should be 1 for FIFO compaction");
+ }
+ if (FLAGS_num_multi_db != 0) {
+ return Status::InvalidArgument("Doesn't support multiDB");
+ }
+ auto db = db_list[0];
+ std::vector<std::string> file_names;
+ // Keep writing and flushing until level 0 has reached the FIFO size limit.
+ // sorted_runs is never filled in this branch, so every round uses
+ // write_mode.
+ while (true) {
+ if (sorted_runs[0].empty()) {
+ DoWrite(thread, write_mode);
+ } else {
+ DoWrite(thread, UNIQUE_RANDOM);
+ }
+ db->Flush(FlushOptions());
+ db->GetColumnFamilyMetaData(&meta);
+ auto total_size = meta.levels[0].size;
+ if (total_size >=
+ db->GetOptions().compaction_options_fifo.max_table_files_size) {
+ for (auto file_meta : meta.levels[0].files) {
+ file_names.emplace_back(file_meta.name);
+ }
+ break;
+ }
+ }
+ // file_names is only needed by the disabled CompactFiles() call below.
+ // TODO(shuzhang1989): Investigate why CompactFiles not working
+ // auto compactionOptions = CompactionOptions();
+ // db->CompactFiles(compactionOptions, file_names, 0);
+ auto compactionOptions = CompactRangeOptions();
+ db->CompactRange(compactionOptions, nullptr, nullptr);
+ } else {
+ // NOTE(review): "-compaction_stype" in the message below looks like a typo
+ // for "compaction_style".
+ fprintf(stdout,
+ "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
+ "filldeterministic");
+ return Status::InvalidArgument("None compaction is not supported");
+ }
+
+// Verify seqno and key range
+// Note: the seqno get changed at the max level by implementation
+// optimization, so skip the check of the max level.
+#ifndef NDEBUG
+ for (size_t k = 0; k < num_db; k++) {
+ auto db = db_list[k];
+ db->GetColumnFamilyMetaData(&meta);
+ // verify the number of sorted runs
+ if (compaction_style == kCompactionStyleLevel) {
+ assert(num_levels - 1 == sorted_runs[k].size());
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ assert(meta.levels[0].files.size() + num_levels - 1 ==
+ sorted_runs[k].size());
+ } else if (compaction_style == kCompactionStyleFIFO) {
+ // TODO(gzh): FIFO compaction
+ db->GetColumnFamilyMetaData(&meta);
+ auto total_size = meta.levels[0].size;
+ assert(total_size <=
+ db->GetOptions().compaction_options_fifo.max_table_files_size);
+ break;
+ }
+
+ // verify smallest/largest seqno and key range of each sorted run
+ auto max_level = num_levels - 1;
+ int level;
+ for (size_t i = 0; i < sorted_runs[k].size(); i++) {
+ // Run i was compacted to level (max_level - i); a non-positive value means
+ // the run stayed on level 0.
+ level = static_cast<int>(max_level - i);
+ SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber sorted_run_largest_seqno = 0;
+ std::string sorted_run_smallest_key, sorted_run_largest_key;
+ bool first_key = true;
+ // Fold the recorded files of the run into its overall seqno and key range.
+ for (auto fileMeta : sorted_runs[k][i]) {
+ sorted_run_smallest_seqno =
+ std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
+ sorted_run_largest_seqno =
+ std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
+ if (first_key ||
+ db->DefaultColumnFamily()->GetComparator()->Compare(
+ fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
+ sorted_run_smallest_key = fileMeta.smallestkey;
+ }
+ if (first_key ||
+ db->DefaultColumnFamily()->GetComparator()->Compare(
+ fileMeta.largestkey, sorted_run_largest_key) > 0) {
+ sorted_run_largest_key = fileMeta.largestkey;
+ }
+ first_key = false;
+ }
+ if (compaction_style == kCompactionStyleLevel ||
+ (compaction_style == kCompactionStyleUniversal && level > 0)) {
+ // Compare the run against the current contents of the level it was
+ // compacted into.
+ SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber level_largest_seqno = 0;
+ for (auto fileMeta : meta.levels[level].files) {
+ level_smallest_seqno =
+ std::min(level_smallest_seqno, fileMeta.smallest_seqno);
+ level_largest_seqno =
+ std::max(level_largest_seqno, fileMeta.largest_seqno);
+ }
+ assert(sorted_run_smallest_key ==
+ meta.levels[level].files.front().smallestkey);
+ assert(sorted_run_largest_key ==
+ meta.levels[level].files.back().largestkey);
+ if (level != static_cast<int>(max_level)) {
+ // compaction at max_level would change sequence number
+ assert(sorted_run_smallest_seqno == level_smallest_seqno);
+ assert(sorted_run_largest_seqno == level_largest_seqno);
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ // level <= 0 means sorted runs on level 0
+ auto level0_file =
+ meta.levels[0].files[sorted_runs[k].size() - 1 - i];
+ assert(sorted_run_smallest_key == level0_file.smallestkey);
+ assert(sorted_run_largest_key == level0_file.largestkey);
+ if (level != static_cast<int>(max_level)) {
+ assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
+ assert(sorted_run_largest_seqno == level0_file.largest_seqno);
+ }
+ }
+ }
+ }
+#endif
+ // print the size of each sorted_run
+ for (size_t k = 0; k < num_db; k++) {
+ auto db = db_list[k];
+ fprintf(stdout,
+ "---------------------- DB %" ROCKSDB_PRIszt " LSM ---------------------\n", k);
+ db->GetColumnFamilyMetaData(&meta);
+ for (auto& levelMeta : meta.levels) {
+ if (levelMeta.files.empty()) {
+ continue;
+ }
+ // Level 0 is listed file by file; other levels as a first-last file range
+ // with the level's total size.
+ if (levelMeta.level == 0) {
+ for (auto& fileMeta : levelMeta.files) {
+ fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n",
+ levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
+ }
+ } else {
+ fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
+ levelMeta.level, levelMeta.files.front().name.c_str(),
+ levelMeta.files.back().name.c_str(), levelMeta.size);
+ }
+ }
+ }
+ // Restore the options that were overridden at the top of the function.
+ for (size_t i = 0; i < num_db; i++) {
+ db_list[i]->SetOptions(
+ {{"disable_auto_compactions",
+ std::to_string(options_list[i].disable_auto_compactions)},
+ {"level0_slowdown_writes_trigger",
+ std::to_string(options_list[i].level0_slowdown_writes_trigger)},
+ {"level0_stop_writes_trigger",
+ std::to_string(options_list[i].level0_stop_writes_trigger)}});
+ }
+ return Status::OK();
+#else
+ (void)thread;
+ (void)compaction_style;
+ (void)write_mode;
+ fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
+ return Status::NotSupported(
+ "Rocksdb Lite doesn't support filldeterministic");
+#endif // ROCKSDB_LITE
+ }
+
+ // Sequential-read benchmark entry point: scans the single open database, or
+ // every database in multi_dbs_ one after another when none is open.
+ void ReadSequential(ThreadState* thread) {
+   if (db_.db == nullptr) {
+     for (const auto& entry : multi_dbs_) {
+       ReadSequential(thread, entry.db);
+     }
+     return;
+   }
+   ReadSequential(thread, db_.db);
+ }
+
+ // Iterates forward over `db` from the first key, visiting at most reads_
+ // entries, and records the ops and the key+value bytes seen in the thread's
+ // stats. When a shared read rate limiter is configured, a request of 1024 is
+ // made to it each time the running entry count is 1023 modulo 1024.
+ void ReadSequential(ThreadState* thread, DB* db) {
+   ReadOptions read_opts(FLAGS_verify_checksum, true);
+   read_opts.tailing = FLAGS_use_tailing_iterator;
+
+   int64_t bytes_seen = 0;
+   int64_t entries = 0;
+   std::unique_ptr<Iterator> it(db->NewIterator(read_opts));
+   it->SeekToFirst();
+   while (entries < reads_ && it->Valid()) {
+     bytes_seen += it->key().size() + it->value().size();
+     thread->stats.FinishedOps(nullptr, db, 1, kRead);
+     ++entries;
+
+     const bool limiter_due = (entries % 1024 == 1023);
+     if (limiter_due && thread->shared->read_rate_limiter.get() != nullptr) {
+       thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
+                                                  nullptr /* stats */,
+                                                  RateLimiter::OpType::kRead);
+     }
+     it->Next();
+   }
+
+   // Destroy the iterator before the stats and perf context are reported.
+   it.reset();
+   thread->stats.AddBytes(bytes_seen);
+   if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+     thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+                              get_perf_context()->ToString());
+   }
+ }
+
+ // Issues one Get() for every key index in [0, FLAGS_num), in increasing
+ // order, then reports how many keys were found and how many bytes were read.
+ // NOTE(review): judging by the name this is meant to populate the row cache;
+ // nothing in this function configures a cache itself - confirm with callers.
+ void ReadToRowCache(ThreadState* thread) {
+ int64_t read = 0;
+ int64_t found = 0;
+ int64_t bytes = 0;
+ int64_t key_rand = 0;
+ ReadOptions options(FLAGS_verify_checksum, true);
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ PinnableSlice pinnable_val;
+
+ while (key_rand < FLAGS_num) {
+ DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+ // We use same key_rand as seed for key and column family so that we can
+ // deterministically find the cfh corresponding to a particular key, as it
+ // is done in DoWrite method. The seed is captured before key_rand is
+ // advanced so that GetCfh() really sees the value the key was built from.
+ const int64_t key_seed = key_rand;
+ GenerateKeyFromInt(key_seed, FLAGS_num, &key);
+ key_rand++;
+ read++;
+ // Drop whatever the previous Get() left in the slice before reusing it,
+ // regardless of which column-family path is taken below.
+ pinnable_val.Reset();
+ Status s;
+ if (FLAGS_num_column_families > 1) {
+ s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_seed), key,
+ &pinnable_val);
+ } else {
+ s = db_with_cfh->db->Get(options,
+ db_with_cfh->db->DefaultColumnFamily(), key,
+ &pinnable_val);
+ }
+
+ if (s.ok()) {
+ found++;
+ bytes += key.size() + pinnable_val.size();
+ } else if (!s.IsNotFound()) {
+ // Anything other than "not found" is treated as fatal.
+ fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+ abort();
+ }
+
+ // Charge the shared read rate limiter (when present) in chunks of 256.
+ if (thread->shared->read_rate_limiter.get() != nullptr &&
+ read % 256 == 255) {
+ thread->shared->read_rate_limiter->Request(
+ 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+ }
+
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+ }
+
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
+ read);
+
+ thread->stats.AddBytes(bytes);
+ thread->stats.AddMessage(msg);
+
+ if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+ thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+ get_perf_context()->ToString());
+ }
+ }
+
+ // Reverse-read entry point: scans the single open DB when there is one,
+ // otherwise scans every DB in multi_dbs_ one after another.
+ void ReadReverse(ThreadState* thread) {
+ if (db_.db == nullptr) {
+ for (const auto& entry : multi_dbs_) {
+ ReadReverse(thread, entry.db);
+ }
+ return;
+ }
+ ReadReverse(thread, db_.db);
+ }
+
+ // Walks `db` backwards from its last entry, visiting at most reads_ entries,
+ // and adds the key+value bytes seen to the thread's stats.
+ void ReadReverse(ThreadState* thread, DB* db) {
+ std::unique_ptr<Iterator> it(
+ db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)));
+ int64_t visited = 0;
+ int64_t total_bytes = 0;
+ it->SeekToLast();
+ while (visited < reads_ && it->Valid()) {
+ total_bytes += it->key().size() + it->value().size();
+ thread->stats.FinishedOps(nullptr, db, 1, kRead);
+ ++visited;
+ // Charge the shared read rate limiter (when present) in chunks of 1024.
+ auto* limiter = thread->shared->read_rate_limiter.get();
+ if (limiter != nullptr && visited % 1024 == 1023) {
+ limiter->Request(1024, Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ }
+ it->Prev();
+ }
+ // Release the iterator before the stats are finalized.
+ it.reset();
+ thread->stats.AddBytes(total_bytes);
+ }
+
+ // Random point-read benchmark that picks keys with a bit mask instead of a
+ // modulo and accounts for its work in batches of 100 Gets. Because the mask
+ // is the next power of two at or above FLAGS_num, some generated indices fall
+ // at or beyond FLAGS_num; those are counted as "non-exist" keys.
+ void ReadRandomFast(ThreadState* thread) {
+ const int kBatch = 100;
+ int64_t issued = 0;
+ int64_t hits = 0;
+ int64_t out_of_range = 0;
+ ReadOptions read_opts(FLAGS_verify_checksum, true);
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ std::string value;
+ DB* db = SelectDBWithCfh(thread)->db;
+
+ // Smallest power of two that is >= FLAGS_num (and at least 1).
+ int64_t pot = 1;
+ while (pot < FLAGS_num) {
+ pot <<= 1;
+ }
+ const int64_t mask = pot - 1;
+
+ Duration duration(FLAGS_duration, reads_);
+ do {
+ for (int n = 0; n < kBatch; ++n) {
+ int64_t key_rand = thread->rand.Next() & mask;
+ GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+ ++issued;
+ auto status = db->Get(read_opts, key, &value);
+ if (status.ok()) {
+ ++hits;
+ } else if (!status.IsNotFound()) {
+ // Anything other than "not found" is treated as fatal.
+ fprintf(stderr, "Get returned an error: %s\n",
+ status.ToString().c_str());
+ abort();
+ }
+ if (key_rand >= FLAGS_num) {
+ ++out_of_range;
+ }
+ }
+ auto* limiter = thread->shared->read_rate_limiter.get();
+ if (limiter != nullptr) {
+ limiter->Request(kBatch, Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ }
+
+ thread->stats.FinishedOps(nullptr, db, kBatch, kRead);
+ } while (!duration.Done(kBatch));
+
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, "
+ "issued %" PRIu64 " non-exist keys)\n",
+ hits, issued, out_of_range);
+
+ thread->stats.AddMessage(msg);
+
+ if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+ const std::string perf_dump = get_perf_context()->ToString();
+ thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") + perf_dump);
+ }
+ }
+
+ // Draws one key index from `rand`. With read_random_exp_range_ == 0 the
+ // index is the raw random value modulo FLAGS_num; otherwise the raw value is
+ // first pushed through an exponential curve controlled by
+ // read_random_exp_range_ and then scattered with a multiplicative constant.
+ int64_t GetRandomKey(Random64* rand) {
+ const uint64_t raw = rand->Next();
+ if (read_random_exp_range_ == 0) {
+ return static_cast<int64_t>(raw % FLAGS_num);
+ }
+
+ const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
+ // fraction is in [0, 1); the exponent is therefore in
+ // (-read_random_exp_range_, 0].
+ const long double fraction = static_cast<long double>(raw % kBigInt) /
+ static_cast<long double>(kBigInt);
+ const long double order = -fraction * read_random_exp_range_;
+ const long double exp_ran = std::exp(order);
+ const uint64_t rand_num =
+ static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
+ // Map to a different number to avoid locality.
+ const uint64_t kBigPrime = 0x5bd1e995;
+ // Overflow is like %(2^64). Will have little impact of results.
+ return static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
+ }
+
+ // Random point-read benchmark: issues Get() calls until the Duration built
+ // from FLAGS_duration / reads_ is done, then reports how many keys were found
+ // and how many bytes were read. When entries_per_batch_ > 1 and
+ // FLAGS_multiread_stride is set, keys are visited in strided runs of
+ // entries_per_batch_ keys instead of being drawn independently.
+ void ReadRandom(ThreadState* thread) {
+ int64_t read = 0;
+ int64_t found = 0;
+ int64_t bytes = 0;
+ int num_keys = 0;
+ int64_t key_rand = GetRandomKey(&thread->rand);
+ ReadOptions options(FLAGS_verify_checksum, true);
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ PinnableSlice pinnable_val;
+
+ Duration duration(FLAGS_duration, reads_);
+ while (!duration.Done(1)) {
+ DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+ // We use same key_rand as seed for key and column family so that we can
+ // deterministically find the cfh corresponding to a particular key, as it
+ // is done in DoWrite method. The seed is captured here because key_rand
+ // is advanced to the *next* key below, before the Get() is issued.
+ const int64_t key_seed = key_rand;
+ GenerateKeyFromInt(key_seed, FLAGS_num, &key);
+ if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
+ if (++num_keys == entries_per_batch_) {
+ // Run finished: start a new strided run at a fresh random position,
+ // pulled back if the run would extend past FLAGS_num.
+ num_keys = 0;
+ key_rand = GetRandomKey(&thread->rand);
+ if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
+ FLAGS_num) {
+ key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
+ }
+ } else {
+ key_rand += FLAGS_multiread_stride;
+ }
+ } else {
+ key_rand = GetRandomKey(&thread->rand);
+ }
+ read++;
+ // Drop whatever the previous Get() left in the slice before reusing it,
+ // regardless of which column-family path is taken below.
+ pinnable_val.Reset();
+ Status s;
+ if (FLAGS_num_column_families > 1) {
+ s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_seed), key,
+ &pinnable_val);
+ } else {
+ s = db_with_cfh->db->Get(options,
+ db_with_cfh->db->DefaultColumnFamily(), key,
+ &pinnable_val);
+ }
+ if (s.ok()) {
+ found++;
+ bytes += key.size() + pinnable_val.size();
+ } else if (!s.IsNotFound()) {
+ // Anything other than "not found" is treated as fatal.
+ fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+ abort();
+ }
+
+ // Charge the shared read rate limiter (when present) in chunks of 256.
+ if (thread->shared->read_rate_limiter.get() != nullptr &&
+ read % 256 == 255) {
+ thread->shared->read_rate_limiter->Request(
+ 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+ }
+
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+ }
+
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
+ found, read);
+
+ thread->stats.AddBytes(bytes);
+ thread->stats.AddMessage(msg);
+
+ if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+ thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+ get_perf_context()->ToString());
+ }
+ }
+
+ // Calls MultiGet over a list of keys from a random distribution.
+ // The function returns nothing; the number of keys found (out of the number
+ // read) is reported through the thread's stats message.
+ // Each MultiGet covers entries_per_batch_ keys. With FLAGS_multiread_stride
+ // set, the keys of one batch are spaced by that stride starting from a random
+ // position; otherwise every key of the batch is drawn independently.
+ void MultiReadRandom(ThreadState* thread) {
+ int64_t read = 0;
+ int64_t num_multireads = 0;
+ int64_t found = 0;
+ ReadOptions options(FLAGS_verify_checksum, true);
+ std::vector<Slice> keys;
+ std::vector<std::unique_ptr<const char[]> > key_guards;
+ // `values` receives results on the non-batched path, `pin_values` and
+ // `stat_list` on the batched path.
+ std::vector<std::string> values(entries_per_batch_);
+ PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
+ std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
+ std::vector<Status> stat_list(entries_per_batch_);
+ // Pre-allocate one key buffer per batch slot; the buffers are reused for
+ // every batch.
+ while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
+ key_guards.push_back(std::unique_ptr<const char[]>());
+ keys.push_back(AllocateKey(&key_guards.back()));
+ }
+
+ Duration duration(FLAGS_duration, reads_);
+ while (!duration.Done(1)) {
+ DB* db = SelectDB(thread);
+ if (FLAGS_multiread_stride) {
+ int64_t key = GetRandomKey(&thread->rand);
+ // Pull the start back when the strided batch would run past FLAGS_num.
+ // NOTE(review): the adjusted start is negative when
+ // entries_per_batch_ * FLAGS_multiread_stride exceeds FLAGS_num -
+ // confirm the flags are validated elsewhere.
+ if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
+ static_cast<int64_t>(FLAGS_num)) {
+ key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
+ }
+ for (int64_t i = 0; i < entries_per_batch_; ++i) {
+ GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
+ key += FLAGS_multiread_stride;
+ }
+ } else {
+ for (int64_t i = 0; i < entries_per_batch_; ++i) {
+ GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
+ }
+ }
+ if (!FLAGS_multiread_batched) {
+ // Vector-returning MultiGet overload.
+ std::vector<Status> statuses = db->MultiGet(options, keys, &values);
+ assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
+
+ read += entries_per_batch_;
+ num_multireads++;
+ for (int64_t i = 0; i < entries_per_batch_; ++i) {
+ if (statuses[i].ok()) {
+ ++found;
+ } else if (!statuses[i].IsNotFound()) {
+ fprintf(stderr, "MultiGet returned an error: %s\n",
+ statuses[i].ToString().c_str());
+ abort();
+ }
+ }
+ } else {
+ // Array-based MultiGet overload against the default column family.
+ db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
+ keys.data(), pin_values, stat_list.data());
+
+ read += entries_per_batch_;
+ num_multireads++;
+ for (int64_t i = 0; i < entries_per_batch_; ++i) {
+ if (stat_list[i].ok()) {
+ ++found;
+ } else if (!stat_list[i].IsNotFound()) {
+ fprintf(stderr, "MultiGet returned an error: %s\n",
+ stat_list[i].ToString().c_str());
+ abort();
+ }
+ // Clear the per-slot state so the arrays can be reused by the next
+ // batch.
+ stat_list[i] = Status::OK();
+ pin_values[i].Reset();
+ }
+ }
+ // Charge the shared read rate limiter (when present) once every 256
+ // MultiGet calls, for 256 batches worth of keys.
+ if (thread->shared->read_rate_limiter.get() != nullptr &&
+ num_multireads % 256 == 255) {
+ thread->shared->read_rate_limiter->Request(
+ 256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ }
+ thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
+ }
+
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
+ found, read);
+ thread->stats.AddMessage(msg);
+ }
+
+ // Inverse CDF of the (generalized) Pareto distribution, rounded up to the
+ // next integer. `u` is the probability to invert; `theta`, `k` and `sigma`
+ // are the distribution parameters. k == 0 selects the logarithmic form.
+ int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
+ const double inverse =
+ (k == 0.0) ? theta - sigma * std::log(u)
+ : theta + sigma * (std::pow(u, -1 * k) - 1) / k;
+ return static_cast<int64_t>(ceil(inverse));
+ }
+ // Inverse of the power function y = a * x^b, i.e. x = (u / a)^(1 / b),
+ // rounded up to the next integer.
+ int64_t PowerCdfInversion(double u, double a, double b) {
+ const double base = u / a;
+ const double exponent = 1 / b;
+ return static_cast<int64_t>(ceil(std::pow(base, exponent)));
+ }
+
+ // Add noise to the QPS.
+ // Returns `origin` shifted by a random delta drawn from a band whose width is
+ // the integer part of FLAGS_sine_a, scaled by `noise_ratio`. `origin` is
+ // returned unchanged when `noise_ratio` is outside [0, 1], when the band is
+ // empty, or when the shifted value would be negative.
+ double AddNoise(double origin, double noise_ratio) {
+ if (noise_ratio < 0.0 || noise_ratio > 1.0) {
+ return origin;
+ }
+ int band_int = static_cast<int>(FLAGS_sine_a);
+ // FLAGS_sine_a values in (-1, 1) truncate to 0, and `rand() % 0` is
+ // undefined behaviour; an empty band simply means "no noise".
+ if (band_int == 0) {
+ return origin;
+ }
+ double delta = (rand() % band_int - band_int / 2) * noise_ratio;
+ if (origin + delta < 0) {
+ return origin;
+ } else {
+ return (origin + delta);
+ }
+ }
+
+ // Picks a query type for each request according to configured ratios.
+ // Type indices follow the order of the ratios handed to Initiate():
+ // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
+ class QueryDecider {
+ public:
+ // Cumulative upper bounds (exclusive) of each type inside [0, range_).
+ std::vector<int> type_;
+ // Normalized ratio of each type (input ratio divided by the input sum).
+ std::vector<double> ratio_;
+ // Total size of the integer range the types are spread over.
+ int range_;
+
+ QueryDecider() {}
+ ~QueryDecider() {}
+
+ // Normalizes `ratio_input` and lays the types out over an integer range of
+ // roughly 1000 slots (each share is rounded up). Always returns OK.
+ Status Initiate(std::vector<double> ratio_input) {
+ const int kRangeMax = 1000;
+ double total = 0.0;
+ for (const double r : ratio_input) {
+ total += r;
+ }
+ range_ = 0;
+ for (const double r : ratio_input) {
+ const double share = r / total;
+ range_ += static_cast<int>(ceil(kRangeMax * share));
+ type_.push_back(range_);
+ ratio_.push_back(share);
+ }
+ return Status::OK();
+ }
+
+ // Maps `rand_num` (its sign is ignored) to the index of the type whose slot
+ // range contains it; falls back to type 0.
+ int GetType(int64_t rand_num) {
+ if (rand_num < 0) {
+ rand_num = -rand_num;
+ }
+ assert(range_ != 0);
+ const int slot = static_cast<int>(rand_num % range_);
+ const int num_types = static_cast<int>(type_.size());
+ int idx = 0;
+ while (idx < num_types) {
+ if (slot < type_[idx]) {
+ return idx;
+ }
+ ++idx;
+ }
+ return 0;
+ }
+ };
+
+ // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
+ // to transfer a random value to one keyrange based on the hotness.
+ struct KeyrangeUnit {
+ // First integer of the slice this key-range occupies in the amplified
+ // random range.
+ int64_t keyrange_start;
+ // Width of that slice; proportional to how hot the key-range is.
+ int64_t keyrange_access;
+ // Number of keys in the key-range.
+ int64_t keyrange_keys;
+ };
+
+ // From our observations, the prefix hotness (key-range hotness) follows
+ // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
+ // However, we cannot directly use the inverse function to decide a
+ // key-range from a random distribution. To achieve it, we create a list of
+ // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
+ // decided based on the hotness of the key-range. When a random value is
+ // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
+ // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
+ // selected is the same as the hotness of this KeyrangeUnit. After that, the
+ // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
+ // can based on the power distribution (y=ax^b) to generate the offset of
+ // the key in the selected key-range. In this way, we generate the keyID
+ // based on the hotness of the prefix and also the key hotness distribution.
+ class GenerateTwoTermExpKeys {
+ public:
+ // Exclusive upper bound of the integer range covered by keyrange_set_.
+ int64_t keyrange_rand_max_;
+ // Number of keys in each key-range.
+ int64_t keyrange_size_;
+ // Number of key-ranges (FLAGS_keyrange_num, clamped to at least 1).
+ int64_t keyrange_num_;
+ // Set once InitiateExpDistribution() has been called.
+ bool initiated_;
+ std::vector<KeyrangeUnit> keyrange_set_;
+
+ GenerateTwoTermExpKeys() {
+ keyrange_rand_max_ = FLAGS_num;
+ initiated_ = false;
+ }
+
+ ~GenerateTwoTermExpKeys() {}
+
+ // Initiate the KeyrangeUnit vector and calculate the size of each
+ // KeyrangeUnit.
+ // `total_keys` is split evenly over the key-ranges; prefix_a..prefix_d are
+ // the parameters of f(x) = a*exp(b*x) + c*exp(d*x). Always returns OK.
+ Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
+ double prefix_b, double prefix_c,
+ double prefix_d) {
+ int64_t amplify = 0;
+ int64_t keyrange_start = 0;
+ initiated_ = true;
+ if (FLAGS_keyrange_num <= 0) {
+ keyrange_num_ = 1;
+ } else {
+ keyrange_num_ = FLAGS_keyrange_num;
+ }
+ keyrange_size_ = total_keys / keyrange_num_;
+
+ // Calculate the key-range shares size based on the input parameters
+ for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
+ // Step 1. Calculate the probability that this key range will be
+ // accessed in a query. It is based on the two-term exponential
+ // distribution
+ double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
+ prefix_c * std::exp(prefix_d * pfx);
+ if (keyrange_p < std::pow(10.0, -16.0)) {
+ keyrange_p = 0.0;
+ }
+ // Step 2. Calculate the amplify
+ // In order to allocate a query to a key-range based on the random
+ // number generated for this query, we need to extend the probability
+ // of each key range from [0,1] to [0, amplify]. Amplify is calculated
+ // by 1/(smallest key-range probability). In this way, we ensure that
+ // all key-ranges are assigned with an Integer that >=0
+ if (amplify == 0 && keyrange_p > 0) {
+ amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
+ }
+
+ // Step 3. For each key-range, we calculate its position in the
+ // [0, amplify] range, including the start, the size (keyrange_access)
+ KeyrangeUnit p_unit;
+ p_unit.keyrange_start = keyrange_start;
+ if (0.0 >= keyrange_p) {
+ p_unit.keyrange_access = 0;
+ } else {
+ p_unit.keyrange_access =
+ static_cast<int64_t>(std::floor(amplify * keyrange_p));
+ }
+ p_unit.keyrange_keys = keyrange_size_;
+ keyrange_set_.push_back(p_unit);
+ keyrange_start += p_unit.keyrange_access;
+ }
+ keyrange_rand_max_ = keyrange_start;
+
+ // Step 4. Shuffle the key-ranges randomly
+ // Since the access probability is calculated from small to large,
+ // If we do not re-allocate them, hot key-ranges are always at the end
+ // and cold key-ranges are at the begin of the key space. Therefore, the
+ // key-ranges are shuffled and the rand seed is only decide by the
+ // key-range hotness distribution. With the same distribution parameters
+ // the shuffle results are the same.
+ // The loop is bounded by keyrange_num_ (the clamped count that sized
+ // keyrange_set_) rather than by the raw flag, so the indices always match
+ // the vector.
+ Random64 rand_loca(keyrange_rand_max_);
+ for (int64_t i = 0; i < keyrange_num_; i++) {
+ int64_t pos = rand_loca.Next() % keyrange_num_;
+ assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
+ pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
+ std::swap(keyrange_set_[i], keyrange_set_[pos]);
+ }
+
+ // Step 5. Recalculate the prefix start position after shuffling
+ int64_t offset = 0;
+ for (auto& p_unit : keyrange_set_) {
+ p_unit.keyrange_start = offset;
+ offset += p_unit.keyrange_access;
+ }
+
+ return Status::OK();
+ }
+
+ // Generate the Key ID according to the input ini_rand and key distribution
+ // `ini_rand` selects the key-range; key_dist_a / key_dist_b are the power
+ // distribution parameters used for the offset inside the key-range (both
+ // zero means the offset is taken directly from ini_rand).
+ // NOTE(review): keyrange_rand_max_ and keyrange_size_ are used as divisors
+ // and are assumed to be positive here - confirm the flags guarantee that.
+ int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
+ double key_dist_b) {
+ int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
+
+ // Calculate and select one key-range that contains the new key
+ // (binary search for the last unit whose start is <= keyrange_rand).
+ int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
+ while (start + 1 < end) {
+ int64_t mid = start + (end - start) / 2;
+ assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
+ if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
+ end = mid;
+ } else {
+ start = mid;
+ }
+ }
+ int64_t keyrange_id = start;
+
+ // Select one key in the key-range and compose the keyID
+ int64_t key_offset = 0, key_seed;
+ if (key_dist_a == 0.0 && key_dist_b == 0.0) {
+ key_offset = ini_rand % keyrange_size_;
+ } else {
+ key_seed = static_cast<int64_t>(
+ ceil(std::pow((ini_rand / key_dist_a), (1 / key_dist_b))));
+ Random64 rand_key(key_seed);
+ // Reduce in unsigned arithmetic: converting the 64-bit random value to
+ // a signed integer first yields negative offsets for half of the draws,
+ // which would place the key outside the selected key-range.
+ key_offset = static_cast<int64_t>(
+ rand_key.Next() % static_cast<uint64_t>(keyrange_size_));
+ }
+ return keyrange_size_ * keyrange_id + key_offset;
+ }
+ };
+
+ // The social graph workload mixed with Get, Put, Iterator queries.
+ // The value size and iterator length follow Pareto distribution.
+ // The overall key access follow power distribution. If user models the
+ // workload based on different key-ranges (or different prefixes), user
+ // can use two-term-exponential distribution to fit the workload. User
+ // needs to decides the ratio between Get, Put, Iterator queries before
+ // starting the benchmark.
+ void MixGraph(ThreadState* thread) {
+ int64_t read = 0; // including single gets and Next of iterators
+ int64_t gets = 0;
+ int64_t puts = 0;
+ int64_t found = 0;
+ int64_t seek = 0;
+ // NOTE(review): seek_found is counted below but never reported.
+ int64_t seek_found = 0;
+ int64_t bytes = 0;
+ const int64_t default_value_max = 1 * 1024 * 1024;
+ int64_t value_max = default_value_max;
+ int64_t scan_len_max = FLAGS_mix_max_scan_len;
+ double write_rate = 1000000.0;
+ double read_rate = 1000000.0;
+ bool use_prefix_modeling = false;
+ GenerateTwoTermExpKeys gen_exp;
+ std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
+ FLAGS_mix_seek_ratio};
+ // Scratch buffer that scanned values are copied into (1 MiB, on the stack).
+ char value_buffer[default_value_max];
+ QueryDecider query;
+ RandomGenerator gen;
+ Status s;
+ // Cap the value size at FLAGS_mix_max_value_size when that is smaller than
+ // the 1 MiB default.
+ if (value_max > FLAGS_mix_max_value_size) {
+ value_max = FLAGS_mix_max_value_size;
+ }
+
+ ReadOptions options(FLAGS_verify_checksum, true);
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ PinnableSlice pinnable_val;
+ query.Initiate(ratio);
+
+ // the limit of qps initiation
+ if (FLAGS_sine_a != 0 || FLAGS_sine_d != 0) {
+ thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(read_rate), 100000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kReadsOnly));
+ thread->shared->write_rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
+ }
+
+ // Decide if user wants to use prefix based key generation
+ if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
+ FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
+ use_prefix_modeling = true;
+ gen_exp.InitiateExpDistribution(
+ FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
+ FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
+ }
+
+ Duration duration(FLAGS_duration, reads_);
+ while (!duration.Done(1)) {
+ DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+ int64_t ini_rand, rand_v, key_rand, key_seed;
+ ini_rand = GetRandomKey(&thread->rand);
+ rand_v = ini_rand % FLAGS_num;
+ // u is rand_v scaled into [0, 1); it drives the key, value-size and
+ // scan-length distributions of this iteration.
+ double u = static_cast<double>(rand_v) / FLAGS_num;
+
+ // Generate the keyID based on the key hotness and prefix hotness
+ if (use_prefix_modeling) {
+ key_rand =
+ gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
+ } else {
+ key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
+ Random64 rand(key_seed);
+ // Reduce in unsigned arithmetic: converting the 64-bit random value to
+ // a signed integer first yields a negative key index for half of the
+ // draws, which is then handed to GenerateKeyFromInt() and GetCfh().
+ key_rand = static_cast<int64_t>(rand.Next() %
+ static_cast<uint64_t>(FLAGS_num));
+ }
+ GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+ int query_type = query.GetType(rand_v);
+
+ // change the qps
+ uint64_t now = FLAGS_env->NowMicros();
+ uint64_t usecs_since_last;
+ if (now > thread->stats.GetSineInterval()) {
+ usecs_since_last = now - thread->stats.GetSineInterval();
+ } else {
+ usecs_since_last = 0;
+ }
+
+ // Once per FLAGS_sine_mix_rate_interval_milliseconds, recompute the
+ // target rate from the sine curve (plus noise) and replace both shared
+ // rate limiters with new ones configured for that rate.
+ if (usecs_since_last >
+ (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
+ double usecs_since_start =
+ static_cast<double>(now - thread->stats.GetStart());
+ thread->stats.ResetSineInterval();
+ double mix_rate_with_noise = AddNoise(
+ SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
+ // Reads are Gets plus Seeks (ratio_[0] + ratio_[2]); the write rate is
+ // the Put share (ratio_[1]) scaled by FLAGS_mix_ave_kv_size.
+ read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
+ write_rate =
+ mix_rate_with_noise * query.ratio_[1] * FLAGS_mix_ave_kv_size;
+
+ thread->shared->write_rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
+ thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(read_rate),
+ FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000}, 10,
+ RateLimiter::Mode::kReadsOnly));
+ }
+ // Start the query
+ if (query_type == 0) {
+ // the Get query
+ gets++;
+ read++;
+ if (FLAGS_num_column_families > 1) {
+ s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
+ &pinnable_val);
+ } else {
+ pinnable_val.Reset();
+ s = db_with_cfh->db->Get(options,
+ db_with_cfh->db->DefaultColumnFamily(), key,
+ &pinnable_val);
+ }
+
+ if (s.ok()) {
+ found++;
+ bytes += key.size() + pinnable_val.size();
+ } else if (!s.IsNotFound()) {
+ fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+ abort();
+ }
+
+ // Charge the shared read rate limiter (when present) in chunks of 256.
+ if (thread->shared->read_rate_limiter.get() != nullptr &&
+ read % 256 == 255) {
+ thread->shared->read_rate_limiter->Request(
+ 256, Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kRead);
+ }
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+ } else if (query_type == 1) {
+ // the Put query
+ puts++;
+ int64_t val_size = ParetoCdfInversion(
+ u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
+ // Negative sizes fall back to 10 bytes; oversized ones wrap into
+ // [0, value_max).
+ if (val_size < 0) {
+ val_size = 10;
+ } else if (val_size > value_max) {
+ val_size = val_size % value_max;
+ }
+ s = db_with_cfh->db->Put(
+ write_options_, key,
+ gen.Generate(static_cast<unsigned int>(val_size)));
+ if (!s.ok()) {
+ fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+
+ if (thread->shared->write_rate_limiter) {
+ thread->shared->write_rate_limiter->Request(
+ key.size() + val_size, Env::IO_HIGH, nullptr /*stats*/,
+ RateLimiter::OpType::kWrite);
+ }
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
+ } else if (query_type == 2) {
+ // Seek query
+ if (db_with_cfh->db != nullptr) {
+ Iterator* single_iter = nullptr;
+ single_iter = db_with_cfh->db->NewIterator(options);
+ if (single_iter != nullptr) {
+ single_iter->Seek(key);
+ seek++;
+ read++;
+ if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
+ seek_found++;
+ }
+ // Scan length comes from the Pareto distribution, wrapped into
+ // [0, scan_len_max).
+ int64_t scan_length =
+ ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
+ FLAGS_iter_sigma) %
+ scan_len_max;
+ for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
+ // Copy the value out so that it is actually read.
+ Slice value = single_iter->value();
+ memcpy(value_buffer, value.data(),
+ std::min(value.size(), sizeof(value_buffer)));
+ bytes += single_iter->key().size() + single_iter->value().size();
+ single_iter->Next();
+ assert(single_iter->status().ok());
+ }
+ }
+ delete single_iter;
+ }
+ thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
+ }
+ }
+ char msg[256];
+ snprintf(msg, sizeof(msg),
+ "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64 " of %" PRIu64
+ " in %" PRIu64 " found)\n",
+ gets, puts, seek, found, read);
+
+ thread->stats.AddBytes(bytes);
+ thread->stats.AddMessage(msg);
+
+ if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+ thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+ get_perf_context()->ToString());
+ }
+ }
+
+ // Measures the cost of iterator creation: repeatedly creates an iterator on
+ // a selected DB and destroys it right away, without positioning it.
+ void IteratorCreation(ThreadState* thread) {
+ Duration duration(FLAGS_duration, reads_);
+ ReadOptions read_opts(FLAGS_verify_checksum, true);
+ for (; !duration.Done(1);) {
+ DB* target = SelectDB(thread);
+ // Create and immediately destroy; the iterator itself is never used.
+ delete target->NewIterator(read_opts);
+ thread->stats.FinishedOps(nullptr, target, 1, kOthers);
+ }
+ }
+
+ // Thread 0 acts as the background writer (Put); every other thread runs the
+ // iterator-creation benchmark.
+ void IteratorCreationWhileWriting(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGWriter(thread, kWrite);
+ return;
+ }
+ IteratorCreation(thread);
+ }
+
+ // Random seek benchmark: repeatedly seeks to a random key and then steps up
+ // to FLAGS_seek_nexts entries forwards (or backwards with
+ // FLAGS_reverse_iterator), copying each value out. Reports how many seeks
+ // landed exactly on the requested key and how many bytes were read.
+ void SeekRandom(ThreadState* thread) {
+ int64_t read = 0;
+ int64_t found = 0;
+ int64_t bytes = 0;
+ ReadOptions options(FLAGS_verify_checksum, true);
+ options.total_order_seek = FLAGS_total_order_seek;
+ options.prefix_same_as_start = FLAGS_prefix_same_as_start;
+ options.tailing = FLAGS_use_tailing_iterator;
+ options.readahead_size = FLAGS_readahead_size;
+
+ // Exactly one of these is populated: single_iter when a single DB is open,
+ // otherwise one iterator per DB in multi_dbs_.
+ Iterator* single_iter = nullptr;
+ std::vector<Iterator*> multi_iters;
+ if (db_.db != nullptr) {
+ single_iter = db_.db->NewIterator(options);
+ } else {
+ for (const auto& db_with_cfh : multi_dbs_) {
+ multi_iters.push_back(db_with_cfh.db->NewIterator(options));
+ }
+ }
+
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+
+ // Key buffers backing options.iterate_upper_bound / iterate_lower_bound;
+ // they are declared outside the loop so the Slices the options point at
+ // outlive every iterator created below.
+ std::unique_ptr<const char[]> upper_bound_key_guard;
+ Slice upper_bound = AllocateKey(&upper_bound_key_guard);
+ std::unique_ptr<const char[]> lower_bound_key_guard;
+ Slice lower_bound = AllocateKey(&lower_bound_key_guard);
+
+ Duration duration(FLAGS_duration, reads_);
+ char value_buffer[256];
+ while (!duration.Done(1)) {
+ int64_t seek_pos = thread->rand.Next() % FLAGS_num;
+ GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
+ &key);
+ // With a maximum scan distance configured, bound the iterator to
+ // FLAGS_max_scan_distance keys before (reverse) or after (forward) the
+ // seek position, clamped to [0, FLAGS_num].
+ if (FLAGS_max_scan_distance != 0) {
+ if (FLAGS_reverse_iterator) {
+ GenerateKeyFromInt(
+ static_cast<uint64_t>(std::max(
+ static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
+ FLAGS_num, &lower_bound);
+ options.iterate_lower_bound = &lower_bound;
+ } else {
+ auto min_num =
+ std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
+ GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
+ &upper_bound);
+ options.iterate_upper_bound = &upper_bound;
+ }
+ }
+
+ // Unless a tailing iterator is requested, the iterators are thrown away
+ // and recreated on every iteration with the current `options`
+ // (presumably so that the bounds set above take effect - confirm).
+ if (!FLAGS_use_tailing_iterator) {
+ if (db_.db != nullptr) {
+ delete single_iter;
+ single_iter = db_.db->NewIterator(options);
+ } else {
+ for (auto iter : multi_iters) {
+ delete iter;
+ }
+ multi_iters.clear();
+ for (const auto& db_with_cfh : multi_dbs_) {
+ multi_iters.push_back(db_with_cfh.db->NewIterator(options));
+ }
+ }
+ }
+ // Pick a Iterator to use
+ Iterator* iter_to_use = single_iter;
+ if (single_iter == nullptr) {
+ iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
+ }
+
+ iter_to_use->Seek(key);
+ read++;
+ if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
+ found++;
+ }
+
+ for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
+ // Copy out iterator's value to make sure we read them.
+ Slice value = iter_to_use->value();
+ memcpy(value_buffer, value.data(),
+ std::min(value.size(), sizeof(value_buffer)));
+ bytes += iter_to_use->key().size() + iter_to_use->value().size();
+
+ if (!FLAGS_reverse_iterator) {
+ iter_to_use->Next();
+ } else {
+ iter_to_use->Prev();
+ }
+ assert(iter_to_use->status().ok());
+ }
+
+ // Charge the shared read rate limiter (when present) in chunks of 256.
+ if (thread->shared->read_rate_limiter.get() != nullptr &&
+ read % 256 == 255) {
+ thread->shared->read_rate_limiter->Request(
+ 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+ }
+
+ // NOTE(review): the op is attributed to db_ even when the iterator came
+ // from multi_dbs_ (where db_.db is nullptr) - confirm this is intended.
+ thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+ }
+ delete single_iter;
+ for (auto iter : multi_iters) {
+ delete iter;
+ }
+
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
+ found, read);
+ thread->stats.AddBytes(bytes);
+ thread->stats.AddMessage(msg);
+ if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+ thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+ get_perf_context()->ToString());
+ }
+ }
+
+ // Thread 0 acts as the background writer (Put); every other thread runs the
+ // random-seek benchmark.
+ void SeekRandomWhileWriting(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGWriter(thread, kWrite);
+ return;
+ }
+ SeekRandom(thread);
+ }
+
+ // Thread 0 acts as the background writer (Merge); every other thread runs
+ // the random-seek benchmark.
+ void SeekRandomWhileMerging(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGWriter(thread, kMerge);
+ return;
+ }
+ SeekRandom(thread);
+ }
+
+ // Deletes keys in batches of entries_per_batch_ until deletes_ operations
+ // (or, for random deletes, FLAGS_duration) are exhausted. With `seq` the
+ // keys are consecutive indices starting at 0; otherwise each key index is
+ // drawn at random from [0, FLAGS_num). A failed write terminates the process.
+ void DoDelete(ThreadState* thread, bool seq) {
+ WriteBatch batch;
+ Duration duration(seq ? 0 : FLAGS_duration, deletes_);
+ int64_t batch_base = 0;
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+
+ for (; !duration.Done(entries_per_batch_);
+ batch_base += entries_per_batch_) {
+ DB* db = SelectDB(thread);
+ batch.Clear();
+ for (int64_t slot = 0; slot < entries_per_batch_; ++slot) {
+ int64_t key_index;
+ if (seq) {
+ key_index = batch_base + slot;
+ } else {
+ key_index = thread->rand.Next() % FLAGS_num;
+ }
+ GenerateKeyFromInt(key_index, FLAGS_num, &key);
+ batch.Delete(key);
+ }
+ auto status = db->Write(write_options_, &batch);
+ // The batch is counted as finished before its status is examined.
+ thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
+ if (!status.ok()) {
+ fprintf(stderr, "del error: %s\n", status.ToString().c_str());
+ exit(1);
+ }
+ }
+ }
+
+ // Deletes keys in sequential order.
+ void DeleteSeq(ThreadState* thread) {
+ const bool sequential = true;
+ DoDelete(thread, sequential);
+ }
+
+ // Deletes randomly chosen keys.
+ void DeleteRandom(ThreadState* thread) {
+ const bool sequential = false;
+ DoDelete(thread, sequential);
+ }
+
+ // Thread 0 acts as the background writer (Put); every other thread runs the
+ // random-read benchmark.
+ void ReadWhileWriting(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGWriter(thread, kWrite);
+ return;
+ }
+ ReadRandom(thread);
+ }
+
+ // Thread 0 acts as the background writer (Merge); every other thread runs
+ // the random-read benchmark.
+ void ReadWhileMerging(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGWriter(thread, kMerge);
+ return;
+ }
+ ReadRandom(thread);
+ }
+
+ // Background writer used by the "...WhileWriting" / "...WhileMerging"
+ // benchmarks. Writes random keys with Put (write_merge == kWrite) or Merge
+ // (any other value) until the other threads are done, or - with
+ // FLAGS_finish_after_writes - until writes_ writes have been issued.
+ // A failed write terminates the process.
+ void BGWriter(ThreadState* thread, enum OperationType write_merge) {
+ // Special thread that keeps writing until other threads are done.
+ RandomGenerator gen;
+ int64_t bytes = 0;
+
+ // Private limiter for this writer, only created when a write rate limit is
+ // configured.
+ std::unique_ptr<RateLimiter> write_rate_limiter;
+ if (FLAGS_benchmark_write_rate_limit > 0) {
+ write_rate_limiter.reset(
+ NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+ }
+
+ // Don't merge stats from this thread with the readers.
+ thread->stats.SetExcludeFromMerge();
+
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ uint32_t written = 0;
+ bool hint_printed = false;
+
+ while (true) {
+ DB* db = SelectDB(thread);
+ {
+ // The termination checks read shared counters, so they run under the
+ // shared mutex.
+ MutexLock l(&thread->shared->mu);
+ if (FLAGS_finish_after_writes && written == writes_) {
+ fprintf(stderr, "Exiting the writer after %u writes...\n", written);
+ break;
+ }
+ // "+ 1" accounts for this writer thread, which never marks itself done
+ // while it is still looping.
+ if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+ // Other threads have finished
+ if (FLAGS_finish_after_writes) {
+ // Wait for the writes to be finished
+ if (!hint_printed) {
+ fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
+ static_cast<int>(writes_) - written);
+ hint_printed = true;
+ }
+ } else {
+ // Finish the write immediately
+ break;
+ }
+ }
+ }
+
+ GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+ Status s;
+
+ Slice val = gen.Generate();
+ if (write_merge == kWrite) {
+ s = db->Put(write_options_, key, val);
+ } else {
+ s = db->Merge(write_options_, key, val);
+ }
+ written++;
+
+ if (!s.ok()) {
+ fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ bytes += key.size() + val.size();
+ // NOTE(review): the op is attributed to db_ rather than to the `db`
+ // selected above - confirm this is intended when multiple DBs are open.
+ thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+
+ if (FLAGS_benchmark_write_rate_limit > 0) {
+ write_rate_limiter->Request(
+ key.size() + val.size(), Env::IO_HIGH,
+ nullptr /* stats */, RateLimiter::OpType::kWrite);
+ }
+ }
+ thread->stats.AddBytes(bytes);
+ }
+
+ // Thread 0 runs the background scan; every other thread runs the
+ // random-read benchmark.
+ void ReadWhileScanning(ThreadState* thread) {
+ if (thread->tid <= 0) {
+ BGScan(thread);
+ return;
+ }
+ ReadRandom(thread);
+ }
+
+ // Background scanner: walks the single open DB from start to end over and
+ // over (re-seeking to the first key whenever the iterator runs off the end)
+ // until the Duration built from FLAGS_duration / reads_ is done. Aborts when
+ // multiple DBs are configured or when the iterator reports an error.
+ void BGScan(ThreadState* thread) {
+ if (FLAGS_num_multi_db > 0) {
+ fprintf(stderr, "Not supporting multiple DBs.\n");
+ abort();
+ }
+ assert(db_.db != nullptr);
+ ReadOptions read_options;
+ Iterator* iter = db_.db->NewIterator(read_options);
+
+ fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
+ Duration duration(FLAGS_duration, reads_);
+ uint64_t num_seek_to_first = 0;
+ uint64_t num_next = 0;
+ while (!duration.Done(1)) {
+ // Check the status before Valid(): an iterator that hit an error is not
+ // valid, so testing Valid() first would silently restart the scan and
+ // the error would never be reported.
+ if (!iter->status().ok()) {
+ fprintf(stderr, "Iterator error: %s\n",
+ iter->status().ToString().c_str());
+ abort();
+ } else if (!iter->Valid()) {
+ // Fresh iterator, or the end of the DB was reached: start over.
+ iter->SeekToFirst();
+ num_seek_to_first++;
+ } else {
+ iter->Next();
+ num_next++;
+ }
+
+ thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+ }
+ delete iter;
+ }
+
+ // Stores `value` under the three keys K+"2", K+"1" and K+"0" (K being `key`)
+ // through a single WriteBatch, i.e. atomically. GetMany is the matching
+ // reader. Returns the status of the batch write.
+ Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
+                const Slice& value) {
+   static const char* const kSuffixes[3] = {"2", "1", "0"};
+   const std::string base = key.ToString();
+
+   WriteBatch batch;
+   for (const char* suffix : kSuffixes) {
+     batch.Put(base + suffix, value);
+   }
+   return db->Write(writeoptions, &batch);
+ }
+
+
+
+ // Removes the three keys K+"1", K+"2" and K+"0" (K being `key`) through a
+ // single WriteBatch, i.e. atomically. Counterpart of PutMany; also see
+ // GetMany. Returns the status of the batch write.
+ Status DeleteMany(DB* db, const WriteOptions& writeoptions,
+                   const Slice& key) {
+   static const char* const kSuffixes[3] = {"1", "2", "0"};
+   const std::string base = key.ToString();
+
+   WriteBatch batch;
+   for (const char* suffix : kSuffixes) {
+     batch.Delete(base + suffix);
+   }
+   return db->Write(writeoptions, &batch);
+ }
+
+ // Reads K+"0", K+"1" and K+"2" (K being `key`) under one snapshot and
+ // reports on stderr when the three values are not identical.
+ // ASSUMES that PutMany was used to put (K, V) into the DB.
+ //
+ // `value` is used as the output buffer of every Get. The returned status is
+ // the one of the last Get (the K+"2" lookup).
+ Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
+                std::string* value) {
+   static const char* const kSuffixes[3] = {"0", "1", "2"};
+   const std::string base = key.ToString();
+
+   // Pin all three reads to the same point in time.
+   ReadOptions snapshot_options = readoptions;
+   snapshot_options.snapshot = db->GetSnapshot();
+
+   std::string fetched[3];
+   Status s;
+   for (int i = 0; i < 3; i++) {
+     s = db->Get(snapshot_options, base + kSuffixes[i], value);
+     if (s.ok()) {
+       fetched[i] = *value;
+       continue;
+     }
+     if (!s.IsNotFound()) {
+       fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+       // we continue after error rather than exiting so that we can
+       // find more errors if any
+     }
+     // Missing or unreadable entries compare as the empty string.
+     fetched[i].clear();
+   }
+   db->ReleaseSnapshot(snapshot_options.snapshot);
+
+   const bool all_equal = (fetched[0] == fetched[1]) && (fetched[1] == fetched[2]);
+   if (!all_equal) {
+     fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
+             key.ToString().c_str(), fetched[0].c_str(), fetched[1].c_str(),
+             fetched[2].c_str());
+     // we continue after error rather than exiting so that we can
+     // find more errors if any
+   }
+
+   return s;
+ }
+
+ // Variant of readrandomwriterandom with these differences:
+ // (a) Keys are read and written through GetMany/PutMany (see those).
+ // (b) Deletes are issued as well (per FLAGS_deletepercent).
+ // (c) To get a high 'found' ratio and repeated writes (puts and deletes) on
+ //     the same keys, up to FLAGS_numdistinct distinct keys are used instead
+ //     of FLAGS_num.
+ // (d) There is no MultiGet option.
+ void RandomWithVerify(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string value;
+
+   // Operations still owed in the current batch, per kind.
+   int gets_left = 0;
+   int puts_left = 0;
+   int deletes_left = 0;
+   // Totals reported once the run is over.
+   int64_t found = 0;
+   int64_t gets_done = 0;
+   int64_t puts_done = 0;
+   int64_t deletes_done = 0;
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+
+   // the number of iterations is the larger of read_ or write_
+   for (int64_t i = 0; i < readwrites_; i++) {
+     DB* db = SelectDB(thread);
+     const bool batch_exhausted =
+         gets_left == 0 && puts_left == 0 && deletes_left == 0;
+     if (batch_exhausted) {
+       // one batch completed, reinitialize for next batch
+       gets_left = FLAGS_readwritepercent;
+       deletes_left = FLAGS_deletepercent;
+       puts_left = 100 - gets_left - deletes_left;
+     }
+     GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
+                        FLAGS_numdistinct, &key);
+
+     if (gets_left > 0) {
+       // All gets of a batch come first.
+       Status s = GetMany(db, options, key, &value);
+       if (s.ok()) {
+         found++;
+       } else if (!s.IsNotFound()) {
+         fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
+         // we continue after error rather than exiting so that we can
+         // find more errors if any
+       }
+       gets_left--;
+       gets_done++;
+       thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
+       continue;
+     }
+
+     if (puts_left > 0) {
+       // Then the puts that belong to the same batch.
+       Status s = PutMany(db, write_options_, key, gen.Generate());
+       if (!s.ok()) {
+         fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
+         exit(1);
+       }
+       puts_left--;
+       puts_done++;
+       thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+       continue;
+     }
+
+     if (deletes_left > 0) {
+       // Finally the deletes.
+       Status s = DeleteMany(db, write_options_, key);
+       if (!s.ok()) {
+         fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
+         exit(1);
+       }
+       deletes_left--;
+       deletes_done++;
+       thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
+     }
+   }
+
+   char msg[128];
+   snprintf(msg, sizeof(msg),
+            "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
+            " found:%" PRIu64 ")",
+            gets_done, puts_done, deletes_done, readwrites_, found);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Mixed random read/write workload executed by each thread on its own.
+ // Unlike ReadWhileWriting it does not rely on an extra writer thread.
+ // Work is organised in batches: FLAGS_readwritepercent gets followed by
+ // (100 - FLAGS_readwritepercent) puts.
+ void ReadRandomWriteRandom(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string value;
+   int64_t found = 0;
+   int gets_left = 0;
+   int puts_left = 0;
+   int64_t reads_done = 0;
+   int64_t writes_done = 0;
+   Duration duration(FLAGS_duration, readwrites_);
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+
+   // the number of iterations is the larger of read_ or write_
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+     GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+     if (gets_left == 0 && puts_left == 0) {
+       // one batch completed, reinitialize for next batch
+       gets_left = FLAGS_readwritepercent;
+       puts_left = 100 - gets_left;
+     }
+
+     if (gets_left > 0) {
+       // All gets of a batch come first.
+       Status s = db->Get(options, key, &value);
+       if (s.ok()) {
+         found++;
+       } else if (!s.IsNotFound()) {
+         fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+         // we continue after error rather than exiting so that we can
+         // find more errors if any
+       }
+       gets_left--;
+       reads_done++;
+       thread->stats.FinishedOps(nullptr, db, 1, kRead);
+       continue;
+     }
+
+     if (puts_left > 0) {
+       // Then the puts that complete the batch.
+       Status s = db->Put(write_options_, key, gen.Generate());
+       if (!s.ok()) {
+         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+         exit(1);
+       }
+       puts_left--;
+       writes_done++;
+       thread->stats.FinishedOps(nullptr, db, 1, kWrite);
+     }
+   }
+
+   char msg[100];
+   snprintf(msg, sizeof(msg),
+            "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
+            " found:%" PRIu64 ")",
+            reads_done, writes_done, readwrites_, found);
+   thread->stats.AddMessage(msg);
+ }
+
+ //
+ // Read-modify-write for random keys: every iteration reads a random key and
+ // then overwrites it with a freshly generated value. A failing Get (other
+ // than NotFound) aborts, a failing Put exits.
+ void UpdateRandom(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string value;
+   int64_t found = 0;
+   int64_t bytes = 0;
+   Duration duration(FLAGS_duration, readwrites_);
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   // the number of iterations is the larger of read_ or write_
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+     GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+     // Read phase.
+     const Status read_status = db->Get(options, key, &value);
+     if (read_status.ok()) {
+       ++found;
+       bytes += key.size() + value.size();
+     } else if (!read_status.IsNotFound()) {
+       fprintf(stderr, "Get returned an error: %s\n",
+               read_status.ToString().c_str());
+       abort();
+     }
+
+     // Throttle before writing when a shared write rate limiter is set up.
+     if (thread->shared->write_rate_limiter) {
+       thread->shared->write_rate_limiter->Request(
+           key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
+           RateLimiter::OpType::kWrite);
+     }
+
+     // Write phase.
+     Slice val = gen.Generate();
+     const Status write_status = db->Put(write_options_, key, val);
+     if (!write_status.ok()) {
+       fprintf(stderr, "put error: %s\n", write_status.ToString().c_str());
+       exit(1);
+     }
+     bytes += key.size() + val.size();
+     thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
+   }
+
+   char msg[100];
+   snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+            readwrites_, found);
+   thread->stats.AddBytes(bytes);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Read-XOR-write for random keys. With A the bytes of the existing value, a
+ // random array B of the same size is generated and C = A^B (C[i]=A[i]^B[i])
+ // is written back. When the key is not found the XOR operator is invoked
+ // without an existing value.
+ void XORUpdateRandom(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string existing_value;
+   int64_t found = 0;
+   Duration duration(FLAGS_duration, readwrites_);
+
+   BytesXOROperator xor_operator;
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   // the number of iterations is the larger of read_ or write_
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+     GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+     auto status = db->Get(options, key, &existing_value);
+     const bool have_existing = status.ok();
+     if (have_existing) {
+       ++found;
+     } else if (!status.IsNotFound()) {
+       fprintf(stderr, "Get returned an error: %s\n",
+               status.ToString().c_str());
+       exit(1);
+     }
+
+     // The operand is sized after whatever the read buffer currently holds.
+     Slice value =
+         gen.Generate(static_cast<unsigned int>(existing_value.size()));
+     std::string new_value;
+
+     Slice existing_value_slice = Slice(existing_value);
+     xor_operator.XOR(have_existing ? &existing_value_slice : nullptr, value,
+                      &new_value);
+
+     Status s = db->Put(write_options_, key, Slice(new_value));
+     if (!s.ok()) {
+       fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+     thread->stats.FinishedOps(nullptr, db, 1);
+   }
+
+   char msg[100];
+   snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+            readwrites_, found);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Read-modify-write for random keys where every operation appends a newly
+ // generated operand to the stored value (simulating an append), so values
+ // keep growing. Generally used for benchmarking against merges of similar
+ // type.
+ void AppendRandom(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string value;
+   int64_t found = 0;
+   int64_t bytes = 0;
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   // The number of iterations is the larger of read_ or write_
+   Duration duration(FLAGS_duration, readwrites_);
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+     GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+
+     const Status read_status = db->Get(options, key, &value);
+     if (read_status.IsNotFound()) {
+       // If not existing, then just assume an empty string of data
+       value.clear();
+     } else if (!read_status.ok()) {
+       fprintf(stderr, "Get returned an error: %s\n",
+               read_status.ToString().c_str());
+       abort();
+     } else {
+       ++found;
+       bytes += key.size() + value.size();
+     }
+
+     // Update the value (by appending data)
+     Slice operand = gen.Generate();
+     if (!value.empty()) {
+       // Use a delimiter to match the semantics for StringAppendOperator
+       value.push_back(',');
+     }
+     value.append(operand.data(), operand.size());
+
+     // Write back to the database
+     const Status write_status = db->Put(write_options_, key, value);
+     if (!write_status.ok()) {
+       fprintf(stderr, "put error: %s\n", write_status.ToString().c_str());
+       exit(1);
+     }
+     bytes += key.size() + value.size();
+     thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
+   }
+
+   char msg[100];
+   snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+            readwrites_, found);
+   thread->stats.AddBytes(bytes);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Read-modify-write for random keys (using MergeOperator)
+ // The merge operator to use should be defined by FLAGS_merge_operator
+ // Adjust FLAGS_value_size so that the keys are reasonable for this operator
+ // Assumes that the merge operator is non-null (i.e.: is well-defined)
+ //
+ // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
+ // to simulate random additions over 64-bit integers using merge.
+ //
+ // The number of merges on the same key can be controlled by adjusting
+ // FLAGS_merge_keys.
+ void MergeRandom(ThreadState* thread) {
+   RandomGenerator gen;
+   int64_t bytes = 0;
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   // The number of iterations is the larger of read_ or write_
+   Duration duration(FLAGS_duration, readwrites_);
+   while (!duration.Done(1)) {
+     DBWithColumnFamilies* target = SelectDBWithCfh(thread);
+     const int64_t key_rand = thread->rand.Next() % merge_keys_;
+     GenerateKeyFromInt(key_rand, merge_keys_, &key);
+
+     Slice val = gen.Generate();
+     Status s;
+     if (FLAGS_num_column_families <= 1) {
+       // Single column family: merge into the default one.
+       s = target->db->Merge(write_options_, target->db->DefaultColumnFamily(),
+                             key, val);
+     } else {
+       // Several column families: the random key number picks the handle.
+       s = target->db->Merge(write_options_, target->GetCfh(key_rand), key,
+                             val);
+     }
+
+     if (!s.ok()) {
+       fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+     bytes += key.size() + val.size();
+     thread->stats.FinishedOps(nullptr, target->db, 1, kMerge);
+   }
+
+   // Print some statistics
+   char msg[100];
+   snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+   thread->stats.AddBytes(bytes);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Read and merge random keys. The amount of reads and merges are controlled
+ // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+ // keys (and thus also the number of reads and merges on the same key) can be
+ // adjusted with FLAGS_merge_keys.
+ //
+ // As with MergeRandom, the merge operator to use should be defined by
+ // FLAGS_merge_operator.
+ void ReadRandomMergeRandom(ThreadState* thread) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   RandomGenerator gen;
+   std::string value;
+   int64_t num_hits = 0;
+   int64_t num_gets = 0;
+   int64_t num_merges = 0;
+   // Longest value length seen in the read buffer after any Get.
+   size_t max_length = 0;
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   // the number of iterations is the larger of read_ or write_
+   Duration duration(FLAGS_duration, readwrites_);
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+     GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+     // Each iteration draws a number in [0, 100) to choose merge vs. read.
+     const bool do_merge =
+         int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+     if (!do_merge) {
+       Status s = db->Get(options, key, &value);
+       const size_t current_length = value.length();
+       if (current_length > max_length) {
+         max_length = current_length;
+       }
+
+       if (s.ok()) {
+         num_hits++;
+       } else if (!s.IsNotFound()) {
+         fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+         // we continue after error rather than exiting so that we can
+         // find more errors if any
+       }
+       num_gets++;
+       thread->stats.FinishedOps(nullptr, db, 1, kRead);
+       continue;
+     }
+
+     Status s = db->Merge(write_options_, key, gen.Generate());
+     if (!s.ok()) {
+       fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+     num_merges++;
+     thread->stats.FinishedOps(nullptr, db, 1, kMerge);
+   }
+
+   char msg[100];
+   snprintf(msg, sizeof(msg),
+            "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
+            " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
+            num_gets, num_merges, readwrites_, num_hits, max_length);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Fills the DB with FLAGS_num sequential keys, then walks them in order
+ // with one iterator: seek to a key, step up to FLAGS_seek_nexts neighbours
+ // (Next or Prev depending on FLAGS_reverse_iterator), seek again. Each
+ // position is checked with an assert and counted as one kSeek operation.
+ void WriteSeqSeekSeq(ThreadState* thread) {
+   writes_ = FLAGS_num;
+   DoWrite(thread, SEQUENTIAL);
+   // exclude writes from the ops/sec calculation
+   thread->stats.Start(thread->tid);
+
+   DB* db = SelectDB(thread);
+   std::unique_ptr<Iterator> iter(
+       db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)));
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+
+   // Checks that the iterator sits on `key` and accounts for one seek.
+   auto check_and_count = [&]() {
+     assert(iter->Valid() && iter->key() == key);
+     thread->stats.FinishedOps(nullptr, db, 1, kSeek);
+   };
+
+   for (int64_t i = 0; i < FLAGS_num; ++i) {
+     GenerateKeyFromInt(i, FLAGS_num, &key);
+     iter->Seek(key);
+     check_and_count();
+
+     // Note: the inner loop advances the outer index as well.
+     for (int step = 0; step < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++step) {
+       if (FLAGS_reverse_iterator) {
+         iter->Prev();
+       } else {
+         iter->Next();
+       }
+       GenerateKeyFromInt(++i, FLAGS_num, &key);
+       check_and_count();
+     }
+
+     iter->Seek(key);
+     check_and_count();
+   }
+ }
+
+ // Looks for `key` in the ascending-sorted `data`, restricted to the index
+ // range [start, end]. Returns true when found. Returns false for an empty
+ // vector, an empty range, or as soon as the probed index lies past the last
+ // element.
+ bool binary_search(std::vector<int>& data, int start, int end, int key) {
+   if (data.empty()) return false;
+
+   const int last_index = static_cast<int>(data.size()) - 1;
+   int lo = start;
+   int hi = end;
+   while (lo <= hi) {
+     const int mid = lo + (hi - lo) / 2;
+     if (mid > last_index) return false;
+
+     const int probe = data[mid];
+     if (probe == key) return true;
+     if (probe > key) {
+       hi = mid - 1;
+     } else {
+       lo = mid + 1;
+     }
+   }
+   return false;
+ }
+
+ // Does a bunch of merge operations for a key(key1) where each merge operand
+ // is a sorted list. A performance comparison is then made between doing a
+ // Get for key1 followed by searching for another key(key2) in the large
+ // sorted list vs calling GetMergeOperands for key1 and then searching for
+ // key2 in all the sorted sub-lists. The latter case is expected to be a lot
+ // faster.
+ //
+ // Any failing Merge, Get or GetMergeOperands call is reported on stderr and
+ // terminates the process, so the timings are never printed for data that
+ // was not actually written or read.
+ void GetMergeOperands(ThreadState* thread) {
+   DB* db = SelectDB(thread);
+   const int kTotalValues = 100000;
+   const int kListSize = 100;
+   std::string key = "my_key";
+   std::string value;
+
+   // Build comma-separated lists and merge one whenever i reaches a multiple
+   // of kListSize.
+   for (int i = 1; i < kTotalValues; i++) {
+     if (i % kListSize == 0) {
+       // Remove trailing ','
+       value.pop_back();
+       Status merge_status = db->Merge(WriteOptions(), key, value);
+       if (!merge_status.ok()) {
+         fprintf(stderr, "merge error: %s\n", merge_status.ToString().c_str());
+         exit(1);
+       }
+       value.clear();
+     } else {
+       value.append(std::to_string(i)).append(",");
+     }
+   }
+
+   SortList s;
+   std::vector<int> data;
+   // This value can be experimented with and it will demonstrate the
+   // perf difference between doing a Get and searching for lookup_key in the
+   // resultant large sorted list vs doing GetMergeOperands and searching
+   // for lookup_key within this resultant sorted sub-lists.
+   int lookup_key = 1;
+
+   // Get API call
+   std::cout << "--- Get API call --- \n";
+   PinnableSlice p_slice;
+   uint64_t st = FLAGS_env->NowNanos();
+   Status get_status =
+       db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
+   if (!get_status.ok()) {
+     fprintf(stderr, "get error: %s\n", get_status.ToString().c_str());
+     exit(1);
+   }
+   s.MakeVector(data, p_slice);
+   bool found =
+       binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
+   std::cout << "Found key? " << std::to_string(found) << "\n";
+   uint64_t sp = FLAGS_env->NowNanos();
+   std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
+   std::string* dat_ = p_slice.GetSelf();
+   std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
+             << "\n";
+   data.clear();
+
+   // GetMergeOperands API call
+   std::cout << "--- GetMergeOperands API --- \n";
+   std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
+   st = FLAGS_env->NowNanos();
+   int number_of_operands = 0;
+   GetMergeOperandsOptions get_merge_operands_options;
+   // Same capacity as a_slice, derived from the same constants.
+   get_merge_operands_options.expected_max_number_of_operands =
+       (kTotalValues / kListSize) + 1;
+   Status operands_status = db->GetMergeOperands(
+       ReadOptions(), db->DefaultColumnFamily(), key, a_slice.data(),
+       &get_merge_operands_options, &number_of_operands);
+   if (!operands_status.ok()) {
+     fprintf(stderr, "get merge operands error: %s\n",
+             operands_status.ToString().c_str());
+     exit(1);
+   }
+   // Only the first `number_of_operands` slices were filled in; never look
+   // beyond the pre-allocated capacity either.
+   const int operand_count =
+       std::min(number_of_operands, static_cast<int>(a_slice.size()));
+   for (int idx = 0; idx < operand_count; ++idx) {
+     s.MakeVector(data, a_slice[idx]);
+     found =
+         binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
+     data.clear();
+     if (found) break;
+   }
+   std::cout << "Found key? " << std::to_string(found) << "\n";
+   sp = FLAGS_env->NowNanos();
+   std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
+             << " seconds \n";
+   int to_print = 0;
+   std::cout << "Sample data from GetMergeOperands API call: ";
+   for (int idx = 0; idx < operand_count; ++idx) {
+     std::cout << "List: " << to_print << " : " << *a_slice[idx].GetSelf()
+               << "\n";
+     if (to_print++ > 2) break;
+   }
+ }
+
+#ifndef ROCKSDB_LITE
+ // This benchmark stress tests Transactions. For a given --duration (or
+ // total number of --writes), a Transaction will perform a read-modify-write
+ // to increment the value of a key in each of N(--transaction-sets) sets of
+ // keys (where each set has --num keys). If --threads is set, this will be
+ // done in parallel.
+ //
+ // To test transactions, use --transaction_db=true. Not setting this
+ // parameter will run the same benchmark without transactions.
+ //
+ // RandomTransactionVerify() will then validate the correctness of the results
+ // by checking if the sum of all keys in each set is the same.
+ //
+ // Aborts on an invalid --transaction_sets value, when more than one DB is
+ // configured, or when an insert fails.
+ void RandomTransaction(ThreadState* thread) {
+   // Validate the raw flag BEFORE narrowing it: casting first would wrap an
+   // out-of-range value into the accepted range and hide the mistake.
+   if (FLAGS_transaction_sets < 1 || FLAGS_transaction_sets > 9999) {
+     fprintf(stderr, "invalid value for transaction_sets\n");
+     abort();
+   }
+   if (FLAGS_num_multi_db > 1) {
+     fprintf(stderr,
+             "Cannot run RandomTransaction benchmark with "
+             "FLAGS_multi_db > 1.\n");
+     abort();
+   }
+
+   Duration duration(FLAGS_duration, readwrites_);
+   ReadOptions read_options(FLAGS_verify_checksum, true);
+   uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
+   uint64_t transactions_done = 0;
+
+   TransactionOptions txn_options;
+   txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
+   txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
+
+   RandomTransactionInserter inserter(&thread->rand, write_options_,
+                                      read_options, FLAGS_num,
+                                      num_prefix_ranges);
+
+   while (!duration.Done(1)) {
+     bool success;
+
+     // RandomTransactionInserter will attempt to insert a key for each
+     // # of FLAGS_transaction_sets
+     if (FLAGS_optimistic_transaction_db) {
+       success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
+     } else if (FLAGS_transaction_db) {
+       TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
+       success = inserter.TransactionDBInsert(txn_db, txn_options);
+     } else {
+       // No transaction support requested: plain DB inserts.
+       success = inserter.DBInsert(db_.db);
+     }
+
+     if (!success) {
+       fprintf(stderr, "Unexpected error: %s\n",
+               inserter.GetLastStatus().ToString().c_str());
+       abort();
+     }
+
+     thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
+     transactions_done++;
+   }
+
+   char msg[100];
+   if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
+     snprintf(msg, sizeof(msg),
+              "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
+              transactions_done, inserter.GetFailureCount());
+   } else {
+     snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
+   }
+   thread->stats.AddMessage(msg);
+
+   if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+     thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+                              get_perf_context()->ToString());
+   }
+   thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
+ }
+
+ // Consistency check to run after RandomTransaction(). Every iteration of
+ // RandomTransaction() incremented one key per set by the same value, so the
+ // sums of the keys of all sets must match. Does nothing when neither
+ // transaction mode is enabled. The outcome is printed on stdout.
+ void RandomTransactionVerify() {
+   const bool transactions_used =
+       FLAGS_transaction_db || FLAGS_optimistic_transaction_db;
+   if (!transactions_used) {
+     // transactions not used, nothing to verify.
+     return;
+   }
+
+   const uint16_t num_sets = static_cast<uint16_t>(FLAGS_transaction_sets);
+   const Status verify_status =
+       RandomTransactionInserter::Verify(db_.db, num_sets);
+
+   const char* verdict = verify_status.ok()
+                             ? "RandomTransactionVerify Success.\n"
+                             : "RandomTransactionVerify FAILED!!\n";
+   fprintf(stdout, "%s", verdict);
+ }
+#endif // ROCKSDB_LITE
+
+ // Writes and deletes random keys without overwriting keys.
+ //
+ // This benchmark is intended to partially replicate the behavior of MyRocks
+ // secondary indices: All data is stored in keys and updates happen by
+ // deleting the old version of the key and inserting the new version.
+ //
+ // Each of the FLAGS_numdistinct logical keys owns a slot of `max_counter`
+ // consecutive key ids; an update deletes the current id of the slot and puts
+ // the next one (wrapping around). Key ids to update are drawn from a normal
+ // distribution centred on FLAGS_numdistinct / 2 with FLAGS_stddev.
+ // Any failing operation terminates the process.
+ void RandomReplaceKeys(ThreadState* thread) {
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+   std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
+   size_t max_counter = 50;
+   RandomGenerator gen;
+
+   Status s;
+   DB* db = SelectDB(thread);
+   // Seed the first version of every logical key.
+   for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
+     GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
+     s = db->Put(write_options_, key, gen.Generate());
+     if (!s.ok()) {
+       fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+   }
+
+   // A snapshot is held for the whole update phase (presumably so that the
+   // seeded versions stay visible to it while they are replaced -- confirm).
+   // Keep the handle so it can be handed back once the loop is over instead
+   // of leaking it.
+   const Snapshot* snapshot = db->GetSnapshot();
+
+   std::default_random_engine generator;
+   std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
+                                                 FLAGS_stddev);
+   Duration duration(FLAGS_duration, FLAGS_num);
+   while (!duration.Done(1)) {
+     int64_t rnd_id = static_cast<int64_t>(distribution(generator));
+     // Clamp the sample into the valid id range [0, FLAGS_numdistinct - 1].
+     int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
+                               static_cast<int64_t>(0));
+     GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                        &key);
+     s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
+                                  : db->Delete(write_options_, key);
+     if (s.ok()) {
+       // Advance to the next id of the slot and insert the new version.
+       counters[key_id] = (counters[key_id] + 1) % max_counter;
+       GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                          &key);
+       s = db->Put(write_options_, key, Slice());
+     }
+
+     if (!s.ok()) {
+       fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+
+     thread->stats.FinishedOps(nullptr, db, 1, kOthers);
+   }
+
+   db->ReleaseSnapshot(snapshot);
+
+   char msg[200];
+   snprintf(msg, sizeof(msg),
+            "use single deletes: %d, "
+            "standard deviation: %lf\n",
+            FLAGS_use_single_deletes, FLAGS_stddev);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Reader / expirer side of the time series benchmark. Until the shared
+ // `num_done` counter reaches 1, repeatedly picks a random key id and scans
+ // every entry sharing that prefix (the generated key with its last 8 bytes
+ // zeroed). With `do_deletion` set, expired entries are deleted and the scan
+ // stops at the first non-expired one; otherwise values are copied out.
+ void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
+   ReadOptions options(FLAGS_verify_checksum, true);
+   int64_t read = 0;
+   int64_t found = 0;
+   int64_t bytes = 0;
+
+   // Only work on single database
+   assert(db_.db != nullptr);
+   std::unique_ptr<Iterator> iter(db_.db->NewIterator(options));
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+
+   char value_buffer[256];
+   for (;;) {
+     {
+       MutexLock l(&thread->shared->mu);
+       if (thread->shared->num_done >= 1) {
+         // Write thread have finished
+         break;
+       }
+     }
+     if (!FLAGS_use_tailing_iterator) {
+       // Drop the old iterator first, then open a fresh one.
+       iter.reset();
+       iter.reset(db_.db->NewIterator(options));
+     }
+     // Pick a Iterator to use
+
+     int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
+     GenerateKeyFromInt(key_id, FLAGS_num, &key);
+     // Reset last 8 bytes to 0
+     char* tail = const_cast<char*>(key.data()) + (key.size() - 8);
+     memset(tail, 0, 8);
+     ++read;
+
+     bool key_found = false;
+     // Seek the prefix
+     iter->Seek(key);
+     while (iter->Valid() && iter->key().starts_with(key)) {
+       key_found = true;
+       if (do_deletion) {
+         bytes += iter->key().size();
+         if (!KeyExpired(timestamp_emulator_.get(), iter->key())) {
+           break;
+         }
+         thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
+         db_.db->Delete(write_options_, iter->key());
+       } else {
+         // Copy out iterator's value to make sure we read them.
+         bytes += iter->key().size() + iter->value().size();
+         thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
+         Slice value = iter->value();
+         memcpy(value_buffer, value.data(),
+                std::min(value.size(), sizeof(value_buffer)));
+
+         assert(iter->status().ok());
+       }
+       iter->Next();
+     }
+     if (key_found) {
+       ++found;
+     }
+
+     if (thread->shared->read_rate_limiter) {
+       thread->shared->read_rate_limiter->Request(
+           1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+     }
+   }
+   iter.reset();
+
+   char msg[100];
+   snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
+            read);
+   thread->stats.AddBytes(bytes);
+   thread->stats.AddMessage(msg);
+   if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+     thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+                              get_perf_context()->ToString());
+   }
+ }
+
+ // Writer side of the time series benchmark. Each key starts with the
+ // generated key id; the bytes from offset 8 on (at most 8 of them) are
+ // overwritten with the current emulated timestamp, most significant byte
+ // first. Values are random. Optionally throttled by
+ // FLAGS_benchmark_write_rate_limit.
+ void TimeSeriesWrite(ThreadState* thread) {
+   // Special thread that keeps writing until other threads are done.
+   RandomGenerator gen;
+
+   // Don't merge stats from this thread with the readers.
+   thread->stats.SetExcludeFromMerge();
+
+   std::unique_ptr<RateLimiter> write_rate_limiter;
+   if (FLAGS_benchmark_write_rate_limit > 0) {
+     write_rate_limiter.reset(
+         NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+   }
+
+   std::unique_ptr<const char[]> key_guard;
+   Slice key = AllocateKey(&key_guard);
+
+   Duration duration(FLAGS_duration, writes_);
+   while (!duration.Done(1)) {
+     DB* db = SelectDB(thread);
+
+     // Write key id
+     uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
+     GenerateKeyFromInt(key_id, FLAGS_num, &key);
+
+     // Write timestamp right behind the first 8 bytes of the key.
+     char* ts_dst = const_cast<char*>(key.data()) + 8;
+     const int ts_len = std::min(key_size_ - 8, 8);
+     uint64_t timestamp_value = timestamp_emulator_->Get();
+     if (port::kLittleEndian) {
+       // Emit the low `ts_len` bytes, highest of them first.
+       for (int i = 0; i < ts_len; ++i) {
+         const int shift = (ts_len - i - 1) << 3;
+         ts_dst[i] = (timestamp_value >> shift) & 0xFF;
+       }
+     } else {
+       memcpy(ts_dst, static_cast<void*>(&timestamp_value), ts_len);
+     }
+
+     timestamp_emulator_->Inc();
+
+     Slice val = gen.Generate();
+     Status s = db->Put(write_options_, key, val);
+     if (!s.ok()) {
+       fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+
+     // Bytes are reported per operation rather than accumulated.
+     const int64_t op_bytes = key.size() + val.size();
+     thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+     thread->stats.AddBytes(op_bytes);
+
+     // The limiter exists exactly when the rate limit flag is positive.
+     if (write_rate_limiter) {
+       write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
+                                   nullptr /* stats */,
+                                   RateLimiter::OpType::kWrite);
+     }
+   }
+ }
+
+ // Entry point of the time series benchmark. Threads with a positive tid
+ // read (or delete, when FLAGS_expire_style is "delete" and the tid is within
+ // FLAGS_num_deletion_threads); any other thread is the writer and reports
+ // its own stats once it is done.
+ void TimeSeries(ThreadState* thread) {
+   if (thread->tid <= 0) {
+     TimeSeriesWrite(thread);
+     thread->stats.Stop();
+     thread->stats.Report("timeseries write");
+     return;
+   }
+
+   const bool do_deletion = FLAGS_expire_style == "delete" &&
+                            thread->tid <= FLAGS_num_deletion_threads;
+   TimeSeriesReadOrDelete(thread, do_deletion);
+ }
+
+ // Compacts the full key range of the DB selected for `thread`, with the
+ // bottommost level compaction mode set to kForceOptimized.
+ void Compact(ThreadState* thread) {
+   CompactRangeOptions compact_options;
+   compact_options.bottommost_level_compaction =
+       BottommostLevelCompaction::kForceOptimized;
+
+   DB* target = SelectDB(thread);
+   target->CompactRange(compact_options, nullptr, nullptr);
+ }
+
+ // Runs a full-range compaction with default options on the single DB (when
+ // open) and on every entry of multi_dbs_.
+ void CompactAll() {
+   if (db_.db) {
+     db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+   }
+   for (const auto& entry : multi_dbs_) {
+     entry.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+   }
+ }
+
+ // Calls ResetStats() on the single DB (when open) and on every entry of
+ // multi_dbs_.
+ void ResetStats() {
+   if (db_.db) {
+     db_.db->ResetStats();
+   }
+   for (const auto& entry : multi_dbs_) {
+     entry.db->ResetStats();
+   }
+ }
+
+ // Prints the stats history of the single DB (without header, when open) and
+ // of every entry of multi_dbs_ (each preceded by a header).
+ void PrintStatsHistory() {
+   if (db_.db) {
+     PrintStatsHistoryImpl(db_.db, /*print_header=*/false);
+   }
+   for (const auto& entry : multi_dbs_) {
+     PrintStatsHistoryImpl(entry.db, /*print_header=*/true);
+   }
+ }
+
+ // Dumps the complete stats history of `db` to stdout: one block per
+ // recorded time, one line per stat entry. With `print_header` the DB name is
+ // printed first. When the history cannot be fetched the status is printed
+ // instead.
+ void PrintStatsHistoryImpl(DB* db, bool print_header) {
+   if (print_header) {
+     fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+   }
+
+   std::unique_ptr<StatsHistoryIterator> history;
+   const Status fetch_status =
+       db->GetStatsHistory(0, port::kMaxUint64, &history);
+   if (!fetch_status.ok()) {
+     fprintf(stdout, "%s\n", fetch_status.ToString().c_str());
+     return;
+   }
+   assert(history);
+
+   for (; history->Valid(); history->Next()) {
+     const uint64_t stats_time = history->GetStatsTime();
+     fprintf(stdout, "------ %s ------\n",
+             TimeToHumanString(static_cast<int>(stats_time)).c_str());
+     for (auto& stat : history->GetStatsMap()) {
+       fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
+               stat.first.c_str(), stat.second);
+     }
+   }
+ }
+
+ // Prints the DB property named `key` for the single DB (without header,
+ // when open) and for every entry of multi_dbs_ (each preceded by a header).
+ void PrintStats(const char* key) {
+   if (db_.db) {
+     PrintStats(db_.db, key, false);
+   }
+   for (const auto& entry : multi_dbs_) {
+     PrintStats(entry.db, key, true);
+   }
+ }
+
+ // Prints the property `key` of `db` to stdout, or "(failed)" when the
+ // property cannot be read. With `print_header` the DB name is printed first.
+ void PrintStats(DB* db, const char* key, bool print_header = false) {
+   if (print_header) {
+     fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+   }
+   std::string property_text;
+   const bool property_read = db->GetProperty(key, &property_text);
+   if (!property_read) {
+     property_text = "(failed)";
+   }
+   fprintf(stdout, "\n%s\n", property_text.c_str());
+ }
+
+ // Benchmark operation: replays the trace file against db_. Does nothing
+ // when db_ is not open; the DBs in multi_dbs_ are not replayed.
+ void Replay(ThreadState* thread) {
+   if (db_.db == nullptr) {
+     return;
+   }
+   Replay(thread, &db_);
+ }
+
+ // Replays FLAGS_trace_file against `db_with_cfh`.
+ //
+ // Exits the process with status 1 when the trace reader cannot be created.
+ // A failure reported by MultiThreadReplay() is only logged to stderr.
+ void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
+   std::unique_ptr<TraceReader> reader;
+   Status status = NewFileTraceReader(FLAGS_env, EnvOptions(),
+                                      FLAGS_trace_file, &reader);
+   if (!status.ok()) {
+     fprintf(
+         stderr,
+         "Encountered an error creating a TraceReader from the trace file. "
+         "Error: %s\n",
+         status.ToString().c_str());
+     exit(1);
+   }
+
+   Replayer replayer(db_with_cfh->db, db_with_cfh->cfh, std::move(reader));
+   const uint32_t fast_forward =
+       static_cast<uint32_t>(FLAGS_trace_replay_fast_forward);
+   replayer.SetFastForward(fast_forward);
+   const uint32_t replay_threads =
+       static_cast<uint32_t>(FLAGS_trace_replay_threads);
+   status = replayer.MultiThreadReplay(replay_threads);
+   if (!status.ok()) {
+     fprintf(stderr, "Starting replay failed. Error: %s\n",
+             status.ToString().c_str());
+     return;
+   }
+   fprintf(stdout, "Replay started from trace_file: %s\n",
+           FLAGS_trace_file.c_str());
+ }
+};
+
+int db_bench_tool(int argc, char** argv) {  // db_bench entry point: parses flags, derives the FLAGS_*_e / *_v globals, runs Benchmark; returns 0, or exits 1 on rejected flag combinations
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ static bool initialized = false;  // guards SetUsageMessage so it is only called on the first invocation
+ if (!initialized) {
+ SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+ " [OPTIONS]...");
+ initialized = true;
+ }
+ ParseCommandLineFlags(&argc, &argv, true);
+ FLAGS_compaction_style_e =
+ (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
+#ifndef ROCKSDB_LITE
+ if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {  // the two ways of selecting a Statistics object are mutually exclusive
+ fprintf(stderr,
+ "Cannot provide both --statistics and --statistics_string.\n");
+ exit(1);
+ }
+ if (!FLAGS_statistics_string.empty()) {  // look the Statistics implementation up by name in the object registry
+ Status s = ObjectRegistry::NewInstance()->NewSharedObject<Statistics>(
+ FLAGS_statistics_string, &dbstats);
+ if (dbstats == nullptr) {  // failure is detected through the output pointer; s is only used for the message
+ fprintf(stderr,
+ "No Statistics registered matching string: %s status=%s\n",
+ FLAGS_statistics_string.c_str(), s.ToString().c_str());
+ exit(1);
+ }
+ }
+#endif // ROCKSDB_LITE
+ if (FLAGS_statistics) {  // plain --statistics: use the object returned by CreateDBStatistics()
+ dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ }
+ if (dbstats) {
+ dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
+ }
+ FLAGS_compaction_pri_e =
+ (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
+
+ std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(  // comma-separated flag value -> list of strings
+ FLAGS_max_bytes_for_level_multiplier_additional, ',');
+ for (size_t j = 0; j < fanout.size(); j++) {  // each piece is converted with stoi and appended to the *_v vector
+ FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
+#ifndef CYGWIN
+ std::stoi(fanout[j]));
+#else
+ stoi(fanout[j]));
+#endif
+ }
+
+ FLAGS_compression_type_e =
+ StringToCompressionType(FLAGS_compression_type.c_str());
+
+#ifndef ROCKSDB_LITE
+ FLAGS_blob_db_compression_type_e =
+ StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
+
+ if (!FLAGS_hdfs.empty() && !FLAGS_env_uri.empty()) {  // --hdfs and --env_uri both select the Env, so only one may be given
+ fprintf(stderr, "Cannot provide both --hdfs and --env_uri.\n");
+ exit(1);
+ } else if (!FLAGS_env_uri.empty()) {
+ Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env, &env_guard);  // s is not inspected; success is judged by FLAGS_env below
+ if (FLAGS_env == nullptr) {
+ fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str());
+ exit(1);
+ }
+ }
+#endif // ROCKSDB_LITE
+ if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
+ fprintf(stderr,
+ "`-use_existing_db` must be true for `-use_existing_keys` to be "
+ "settable\n");
+ exit(1);
+ }
+
+ if (!FLAGS_hdfs.empty()) {  // --hdfs replaces FLAGS_env with a new HdfsEnv; it is not deleted in this function
+ FLAGS_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs);
+ }
+
+ if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))  // case-insensitive mapping of --compaction_fadvice to the enum
+ FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
+ else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
+ FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
+ else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
+ FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+ else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
+ FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
+ else {  // an unknown value is only reported on stdout; FLAGS_compaction_fadvice_e is left as it was and the run continues
+ fprintf(stdout, "Unknown compaction fadvice:%s\n",
+ FLAGS_compaction_fadvice.c_str());
+ }
+
+ FLAGS_value_size_distribution_type_e =
+ StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
+
+ FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+ // Note options sanitization may increase thread pool sizes according to
+ // max_background_flushes/max_background_compactions/max_background_jobs
+ FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
+ ROCKSDB_NAMESPACE::Env::Priority::HIGH);
+ FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
+ ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
+ FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
+ ROCKSDB_NAMESPACE::Env::Priority::LOW);
+
+ // Choose a location for the test database if none given with --db=<path>
+ if (FLAGS_db.empty()) {
+ std::string default_db_path;
+ FLAGS_env->GetTestDirectory(&default_db_path);  // return value is not checked
+ default_db_path += "/dbbench";
+ FLAGS_db = default_db_path;
+ }
+
+ if (FLAGS_stats_interval_seconds > 0) {
+ // When both are set then FLAGS_stats_interval determines the frequency
+ // at which the timer is checked for FLAGS_stats_interval_seconds
+ FLAGS_stats_interval = 1000;
+ }
+
+ if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
+ fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
+ exit(1);
+ }
+
+ ROCKSDB_NAMESPACE::Benchmark benchmark;  // all flag-derived globals above must be set before this point
+ benchmark.Run();
+
+#ifndef ROCKSDB_LITE
+ if (FLAGS_print_malloc_stats) {  // optional allocator statistics dump after the benchmarks finish
+ std::string stats_string;
+ ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
+ fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
+ }
+#endif // ROCKSDB_LITE
+
+ return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/tools/db_bench_tool_test.cc b/src/rocksdb/tools/db_bench_tool_test.cc
new file mode 100644
index 000000000..821c602bf
--- /dev/null
+++ b/src/rocksdb/tools/db_bench_tool_test.cc
@@ -0,0 +1,320 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db_bench_tool.h"
+#include "options/options_parser.h"
+#include "rocksdb/utilities/options_util.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifdef GFLAGS
+#include "util/gflags_compat.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static const int kMaxArgCount = 100;  // capacity of DBBenchTest::argv_
+static const size_t kArgBufferSize = 100000;  // size in bytes of DBBenchTest::arg_buffer_, which backs the argument strings
+} // namespace
+
+class DBBenchTest : public testing::Test {  // gtest fixture: builds an argv for db_bench_tool(), runs it, and checks the options found in the resulting DB directory
+ public:
+ DBBenchTest() : rnd_(0xFB) {  // creates the per-test directory and derives the db/wal paths beneath it
+ test_path_ = test::PerThreadDBPath("db_bench_test");
+ Env::Default()->CreateDir(test_path_);  // returned Status is not checked
+ db_path_ = test_path_ + "/db";
+ wal_path_ = test_path_ + "/wal";
+ }
+
+ ~DBBenchTest() {  // performs no cleanup: the DestroyDB call below is commented out
+ // DestroyDB(db_path_, Options());
+ }
+
+ void ResetArgs() {  // clears the argument count, the write cursor and the backing buffer
+ argc_ = 0;
+ cursor_ = 0;
+ memset(arg_buffer_, 0, kArgBufferSize);
+ }
+
+ void AppendArgs(const std::vector<std::string>& args) {  // copies each string, NUL-terminated, into arg_buffer_ and records its address in argv_
+ for (const auto& arg : args) {
+ ASSERT_LE(cursor_ + arg.size() + 1, kArgBufferSize);  // string plus terminator must fit in the remaining buffer
+ ASSERT_LE(argc_ + 1, kMaxArgCount);  // and there must be a free argv_ slot
+ snprintf(arg_buffer_ + cursor_, arg.size() + 1, "%s", arg.c_str());
+
+ argv_[argc_++] = arg_buffer_ + cursor_;
+ cursor_ += arg.size() + 1;  // advance past the terminator so the next argument starts on a fresh byte
+ }
+ }
+
+ void RunDbBench(const std::string& options_file_name) {  // runs db_bench "fillseq" (1000 keys, fresh DB) with --db, --wal_dir and --options_file; expects return value 0
+ AppendArgs({"./db_bench", "--benchmarks=fillseq", "--use_existing_db=0",
+ "--num=1000",
+ std::string(std::string("--db=") + db_path_).c_str(),
+ std::string(std::string("--wal_dir=") + wal_path_).c_str(),
+ std::string(std::string("--options_file=") + options_file_name)
+ .c_str()});
+ ASSERT_EQ(0, db_bench_tool(argc(), argv()));
+ }
+
+ void VerifyOptions(const Options& opt) {  // loads the latest OPTIONS from db_path_; they must match `opt` and must NOT match the library defaults
+ DBOptions loaded_db_opts;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ ASSERT_OK(LoadLatestOptions(db_path_, FileSystem::Default(),
+ &loaded_db_opts, &cf_descs));
+
+ ASSERT_OK(
+ RocksDBOptionsParser::VerifyDBOptions(DBOptions(opt), loaded_db_opts));
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(ColumnFamilyOptions(opt),
+ cf_descs[0].options));  // only the first column family descriptor is compared
+
+ // check with the default rocksdb options and expect failure
+ ASSERT_NOK(
+ RocksDBOptionsParser::VerifyDBOptions(DBOptions(), loaded_db_opts));
+ ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(ColumnFamilyOptions(),
+ cf_descs[0].options));
+ }
+
+ char** argv() { return argv_; }
+
+ int argc() { return argc_; }
+
+ std::string db_path_;  // test_path_ + "/db", passed as --db
+ std::string test_path_;  // per-test root directory created in the constructor
+ std::string wal_path_;  // test_path_ + "/wal", passed as --wal_dir
+
+ char arg_buffer_[kArgBufferSize];  // storage for the NUL-terminated argument strings
+ char* argv_[kMaxArgCount];  // pointers into arg_buffer_
+ int argc_ = 0;  // number of valid entries in argv_
+ int cursor_ = 0;  // offset of the first unused byte in arg_buffer_
+ Random rnd_;  // seeded with 0xFB; not referenced by the methods of this class
+};
+
+namespace {} // namespace
+
+// Persists a few non-default options to an OPTIONS file, runs db_bench with
+// that file, and verifies the options recorded in the DB directory.
+TEST_F(DBBenchTest, OptionsFile) {
+  const std::string options_path = test_path_ + "/OPTIONS_test";
+
+  Options expected;
+  expected.arena_block_size = 8388608;
+  expected.max_background_compactions = 10;
+  expected.max_open_files = 256;
+  expected.create_if_missing = true;
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(expected), {"default"},
+                                  {ColumnFamilyOptions(expected)},
+                                  options_path, Env::Default()));
+
+  // db_bench does not take wal_dir from the options file (it is passed on
+  // the command line), so it is set on the expectation only after the file
+  // has been written.
+  expected.wal_dir = wal_path_;
+
+  RunDbBench(options_path);
+  VerifyOptions(expected);
+}
+
+// Same flow as the OptionsFile test, but with universal compaction and a
+// single level.
+TEST_F(DBBenchTest, OptionsFileUniversal) {
+  const std::string options_path = test_path_ + "/OPTIONS_test";
+
+  Options expected;
+  expected.arena_block_size = 8388608;
+  expected.max_background_compactions = 10;
+  expected.max_open_files = 256;
+  expected.create_if_missing = true;
+  expected.num_levels = 1;
+  expected.compaction_style = kCompactionStyleUniversal;
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(expected), {"default"},
+                                  {ColumnFamilyOptions(expected)},
+                                  options_path, Env::Default()));
+
+  // db_bench does not take wal_dir from the options file (it is passed on
+  // the command line), so it is set on the expectation only after the file
+  // has been written.
+  expected.wal_dir = wal_path_;
+
+  RunDbBench(options_path);
+  VerifyOptions(expected);
+}
+
+// Same flow as the OptionsFile test, but with universal compaction spread
+// over 12 levels.
+TEST_F(DBBenchTest, OptionsFileMultiLevelUniversal) {
+  const std::string options_path = test_path_ + "/OPTIONS_test";
+
+  Options expected;
+  expected.arena_block_size = 8388608;
+  expected.max_background_compactions = 10;
+  expected.max_open_files = 256;
+  expected.create_if_missing = true;
+  expected.num_levels = 12;
+  expected.compaction_style = kCompactionStyleUniversal;
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(expected), {"default"},
+                                  {ColumnFamilyOptions(expected)},
+                                  options_path, Env::Default()));
+
+  // db_bench does not take wal_dir from the options file (it is passed on
+  // the command line), so it is set on the expectation only after the file
+  // has been written.
+  expected.wal_dir = wal_path_;
+
+  RunDbBench(options_path);
+  VerifyOptions(expected);
+}
+
+const std::string options_file_content = R"OPTIONS_FILE(
+[Version]
+ rocksdb_version=4.3.1
+ options_file_version=1.1
+
+[DBOptions]
+ wal_bytes_per_sync=1048576
+ delete_obsolete_files_period_micros=0
+ WAL_ttl_seconds=0
+ WAL_size_limit_MB=0
+ db_write_buffer_size=0
+ max_subcompactions=1
+ table_cache_numshardbits=4
+ max_open_files=-1
+ max_file_opening_threads=10
+ max_background_compactions=5
+ use_fsync=false
+ use_adaptive_mutex=false
+ max_total_wal_size=18446744073709551615
+ compaction_readahead_size=0
+ new_table_reader_for_compaction_inputs=false
+ keep_log_file_num=10
+ skip_stats_update_on_db_open=false
+ max_manifest_file_size=18446744073709551615
+ db_log_dir=
+ skip_log_error_on_recovery=false
+ writable_file_max_buffer_size=1048576
+ paranoid_checks=true
+ is_fd_close_on_exec=true
+ bytes_per_sync=1048576
+ enable_thread_tracking=true
+ recycle_log_file_num=0
+ create_missing_column_families=false
+ log_file_time_to_roll=0
+ max_background_flushes=1
+ create_if_missing=true
+ error_if_exists=false
+ delayed_write_rate=1048576
+ manifest_preallocation_size=4194304
+ allow_mmap_reads=false
+ allow_mmap_writes=false
+ use_direct_reads=false
+ use_direct_io_for_flush_and_compaction=false
+ stats_dump_period_sec=600
+ allow_fallocate=true
+ max_log_file_size=83886080
+ random_access_max_buffer_size=1048576
+ advise_random_on_open=true
+
+
+[CFOptions "default"]
+ compaction_filter_factory=nullptr
+ table_factory=BlockBasedTable
+ prefix_extractor=nullptr
+ comparator=leveldb.BytewiseComparator
+ compression_per_level=
+ max_bytes_for_level_base=104857600
+ bloom_locality=0
+ target_file_size_base=10485760
+ memtable_huge_page_size=0
+ max_successive_merges=1000
+ max_sequential_skip_in_iterations=8
+ arena_block_size=52428800
+ target_file_size_multiplier=1
+ source_compaction_factor=1
+ min_write_buffer_number_to_merge=1
+ max_write_buffer_number=2
+ write_buffer_size=419430400
+ max_grandparent_overlap_factor=10
+ max_bytes_for_level_multiplier=10
+ memtable_factory=SkipListFactory
+ compression=kSnappyCompression
+ min_partial_merge_operands=2
+ level0_stop_writes_trigger=100
+ num_levels=1
+ level0_slowdown_writes_trigger=50
+ level0_file_num_compaction_trigger=10
+ expanded_compaction_factor=25
+ soft_rate_limit=0.000000
+ max_write_buffer_number_to_maintain=0
+ max_write_buffer_size_to_maintain=0
+ verify_checksums_in_compaction=true
+ merge_operator=nullptr
+ memtable_prefix_bloom_bits=0
+ memtable_whole_key_filtering=true
+ paranoid_file_checks=false
+ inplace_update_num_locks=10000
+ optimize_filters_for_hits=false
+ level_compaction_dynamic_level_bytes=false
+ inplace_update_support=false
+ compaction_style=kCompactionStyleUniversal
+ memtable_prefix_bloom_probes=6
+ purge_redundant_kvs_while_flush=true
+ filter_deletes=false
+ hard_pending_compaction_bytes_limit=0
+ disable_auto_compactions=false
+ compaction_measure_io_stats=false
+
+[TableOptions/BlockBasedTable "default"]
+ format_version=0
+ skip_table_builder_flush=false
+ cache_index_and_filter_blocks=false
+ flush_block_policy_factory=FlushBlockBySizePolicyFactory
+ hash_index_allow_collision=true
+ index_type=kBinarySearch
+ whole_key_filtering=true
+ checksum=kCRC32c
+ no_block_cache=false
+ block_size=32768
+ block_size_deviation=10
+ block_restart_interval=16
+ filter_policy=rocksdb.BuiltinBloomFilter
+)OPTIONS_FILE";
+
+// Writes the literal options_file_content to disk, loads it back to build
+// the expected Options, runs db_bench with that file, and verifies the
+// options recorded in the DB directory.
+TEST_F(DBBenchTest, OptionsFileFromFile) {
+  const std::string options_path = test_path_ + "/OPTIONS_flash";
+
+  std::unique_ptr<WritableFile> options_file;
+  ASSERT_OK(Env::Default()->NewWritableFile(options_path, &options_file,
+                                            EnvOptions()));
+  ASSERT_OK(options_file->Append(options_file_content));
+  ASSERT_OK(options_file->Close());
+
+  DBOptions loaded_db_options;
+  std::vector<ColumnFamilyDescriptor> loaded_cfs;
+  ASSERT_OK(LoadOptionsFromFile(options_path, Env::Default(),
+                                &loaded_db_options, &loaded_cfs));
+  Options expected(loaded_db_options, loaded_cfs[0].options);
+
+  expected.create_if_missing = true;
+
+  // db_bench does not take wal_dir from the options file (it is passed on
+  // the command line), so it is overridden on the expectation here.
+  expected.wal_dir = wal_path_;
+
+  RunDbBench(options_path);
+  VerifyOptions(expected);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test driver: initialises googletest, lets gflags consume its own flags,
+// then runs every registered test and returns the gtest result code.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  // Use the GFLAGS_NAMESPACE macro (made available through
+  // util/gflags_compat.h, included above) rather than a hard-coded
+  // `google::`, matching how the other tools in this directory name gflags.
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+// Fallback driver used when gflags is unavailable: there is nothing to run,
+// so report that the test was skipped and exit successfully.
+int main(int /*argc*/, char** /*argv*/) {
+  // Parameter names are commented out to avoid unused-parameter warnings;
+  // the message now ends with a newline so it does not run into the next
+  // line of shell/test-runner output.
+  printf("Skip db_bench_tool_test as the required library GFLAG is missing.\n");
+  return 0;
+}
+#endif // #ifdef GFLAGS
diff --git a/src/rocksdb/tools/db_crashtest.py b/src/rocksdb/tools/db_crashtest.py
new file mode 100644
index 000000000..bf690b1ec
--- /dev/null
+++ b/src/rocksdb/tools/db_crashtest.py
@@ -0,0 +1,499 @@
+#!/usr/bin/env python2
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import os
+import sys
+import time
+import random
+import tempfile
+import subprocess
+import shutil
+import argparse
+
+# params overwrite priority:
+# for default:
+# default_params < {blackbox,whitebox}_default_params < args
+# for simple:
+# default_params < {blackbox,whitebox}_default_params <
+# simple_default_params <
+# {blackbox,whitebox}_simple_default_params < args
+# for cf_consistency:
+# default_params < {blackbox,whitebox}_default_params <
+# cf_consistency_params < args
+# for txn:
+# default_params < {blackbox,whitebox}_default_params < txn_params < args
+
+expected_values_file = tempfile.NamedTemporaryFile()  # created at import time; its .name is handed to db_stress as expected_values_path
+
+default_params = {  # base db_stress flags; a callable value is invoked by finalize_and_sanitize() each time a command line is built, a plain value is fixed
+ "acquire_snapshot_one_in": 10000,
+ "block_size": 16384,
+ "bloom_bits": lambda: random.choice([random.randint(0,19),
+ random.lognormvariate(2.3, 1.3)]),
+ "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
+ "cache_size": 1048576,
+ "checkpoint_one_in": 1000000,
+ "compression_type": lambda: random.choice(
+ ["none", "snappy", "zlib", "bzip2", "lz4", "lz4hc", "xpress", "zstd"]),
+ "bottommost_compression_type": lambda:
+ "disable" if random.randint(0, 1) == 0 else
+ random.choice(
+ ["none", "snappy", "zlib", "bzip2", "lz4", "lz4hc", "xpress",
+ "zstd"]),
+ "checksum_type" : lambda: random.choice(["kCRC32c", "kxxHash", "kxxHash64"]),
+ "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
+ "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
+ "clear_column_family_one_in": 0,
+ "compact_files_one_in": 1000000,
+ "compact_range_one_in": 1000000,
+ "delpercent": 4,
+ "delrangepercent": 1,
+ "destroy_db_initially": 0,
+ "enable_pipelined_write": lambda: random.randint(0, 1),
+ "expected_values_path": expected_values_file.name,
+ "flush_one_in": 1000000,
+ "get_live_files_and_wal_files_one_in": 1000000,
+ # Temporarily disable hash index
+ "index_type": lambda: random.choice([0,2]),
+ "max_background_compactions": 20,
+ "max_bytes_for_level_base": 10485760,
+ "max_key": 100000000,
+ "max_write_buffer_number": 3,
+ "mmap_read": lambda: random.randint(0, 1),
+ "nooverwritepercent": 1,
+ "open_files": lambda : random.choice([-1, 500000]),
+ "partition_filters": lambda: random.randint(0, 1),
+ "pause_background_one_in": 1000000,
+ "prefixpercent": 5,
+ "progress_reports": 0,
+ "readpercent": 45,
+ "recycle_log_file_num": lambda: random.randint(0, 1),
+ "reopen": 20,
+ "snapshot_hold_ops": 100000,
+ "long_running_snapshots": lambda: random.randint(0, 1),
+ "subcompactions": lambda: random.randint(1, 4),
+ "target_file_size_base": 2097152,
+ "target_file_size_multiplier": 2,
+ "use_direct_reads": lambda: random.randint(0, 1),
+ "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
+ "use_full_merge_v1": lambda: random.randint(0, 1),
+ "use_merge": lambda: random.randint(0, 1),
+ "verify_checksum": 1,
+ "write_buffer_size": 4 * 1024 * 1024,
+ "writepercent": 35,
+ "format_version": lambda: random.choice([2, 3, 4, 5, 5]),
+ "index_block_restart_interval": lambda: random.choice(range(1, 16)),
+ "use_multiget" : lambda: random.randint(0, 1),
+ "periodic_compaction_seconds" :
+ lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+ "compaction_ttl" : lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+ # Test small max_manifest_file_size in a smaller chance, as most of the
+ # time we want manifest history to be preserved to help debug
+ "max_manifest_file_size" : lambda : random.choice(
+ [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]),
+ # Sync mode might make test runs slower so running it in a smaller chance
+ "sync" : lambda : random.choice(
+ [1 if t == 0 else 0 for t in range(0, 20)]),
+ # Disable compaction_readahead_size because the test is not passing.
+ #"compaction_readahead_size" : lambda : random.choice(
+ # [0, 0, 1024 * 1024]),
+ "db_write_buffer_size" : lambda: random.choice(
+ [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]),
+ "avoid_unnecessary_blocking_io" : random.randint(0, 1),  # not a lambda: drawn once at import, so the same value is used for every db_stress run
+ "write_dbid_to_manifest" : random.randint(0, 1),  # likewise drawn once at import
+ "max_write_batch_group_size_bytes" : lambda: random.choice(
+ [16, 64, 1024 * 1024, 16 * 1024 * 1024]),
+ "level_compaction_dynamic_level_bytes" : True,
+ "verify_checksum_one_in": 1000000,
+ "verify_db_one_in": 100000,
+ "continuous_verification_interval" : 0,
+ "max_key_len": 3,
+ "key_len_percent_dist": "1,30,69"
+}
+
+_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
+
+
+def get_dbname(test_name):  # returns the DB directory for this run, named after test_name ('rocksdb_crashtest_' + test_name)
+ test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+ if test_tmpdir is None or test_tmpdir == "":  # TEST_TMPDIR unset or empty: let tempfile create a fresh, uniquely named directory
+ dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
+ else:
+ dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name  # fixed path under TEST_TMPDIR
+ shutil.rmtree(dbname, True)  # second argument is ignore_errors: a missing directory is not an error
+ os.mkdir(dbname)
+ return dbname
+
+
+def is_direct_io_supported(dbname):
+ with tempfile.NamedTemporaryFile(dir=dbname) as f:
+ try:
+ os.open(f.name, os.O_DIRECT)
+ except:
+ return False
+ return True
+
+
+blackbox_default_params = {  # overrides applied on top of default_params when test_type == 'blackbox'
+ # total time for this script to test db_stress
+ "duration": 6000,
+ # time for one db_stress instance to run
+ "interval": 120,
+ # since we will be killing anyway, use large value for ops_per_thread
+ "ops_per_thread": 100000000,
+ "set_options_one_in": 10000,
+ "test_batches_snapshots": 1,
+}
+
+whitebox_default_params = {  # overrides applied on top of default_params when test_type == 'whitebox'
+ "duration": 10000,
+ "log2_keys_per_lock": 10,
+ "ops_per_thread": 200000,
+ "random_kill_odd": 888887,  # read by whitebox_crash_main as the base kill_random_test value; not forwarded to db_stress
+ "test_batches_snapshots": lambda: random.randint(0, 1),
+}
+
+simple_default_params = {  # applied when --simple is given
+ "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
+ "column_families": 1,
+ "max_background_compactions": 1,
+ "max_bytes_for_level_base": 67108864,
+ "memtablerep": "skip_list",
+ "prefixpercent": 0,
+ "readpercent": 50,
+ "prefix_size" : -1,
+ "target_file_size_base": 16777216,
+ "target_file_size_multiplier": 1,
+ "test_batches_snapshots": 0,
+ "write_buffer_size": 32 * 1024 * 1024,
+ "level_compaction_dynamic_level_bytes": False,
+}
+
+blackbox_simple_default_params = {  # applied for --simple blackbox runs, after simple_default_params
+ "open_files": -1,
+ "set_options_one_in": 0,
+}
+
+whitebox_simple_default_params = {}  # no extra overrides for --simple whitebox runs
+
+cf_consistency_params = {  # applied when --cf_consistency is given
+ "disable_wal": lambda: random.randint(0, 1),
+ "reopen": 0,
+ "test_cf_consistency": 1,
+ # use small value for write_buffer_size so that RocksDB triggers flush
+ # more frequently
+ "write_buffer_size": 1024 * 1024,
+ "enable_pipelined_write": lambda: random.randint(0, 1),
+}
+
+txn_params = {  # applied when --txn is given
+ "use_txn" : 1,
+ # Avoid lambda to set it once for the entire test
+ "txn_write_policy": random.randint(0, 2),
+ "unordered_write": random.randint(0, 1),
+ "disable_wal": 0,
+ # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
+ "checkpoint_one_in": 0,
+ # pipeline write is not currently compatible with WritePrepared txns
+ "enable_pipelined_write": 0,
+}
+
+def finalize_and_sanitize(src_params):  # returns a new dict: callables in src_params are evaluated, then incompatible option combinations are rewritten
+ dest_params = dict([(k, v() if callable(v) else v)  # resolve every lambda to a concrete value for this run
+ for (k, v) in src_params.items()])
+ if dest_params.get("compression_type") != "zstd" or \
+ dest_params.get("compression_max_dict_bytes") == 0:
+ dest_params["compression_zstd_max_train_bytes"] = 0  # training bytes are kept only with zstd and a non-zero dictionary size
+ if dest_params.get("allow_concurrent_memtable_write", 1) == 1:  # a missing key counts as enabled
+ dest_params["memtablerep"] = "skip_list"
+ if dest_params["mmap_read"] == 1 or not is_direct_io_supported(  # requires "mmap_read" and "db" to be present in src_params
+ dest_params["db"]):
+ dest_params["use_direct_io_for_flush_and_compaction"] = 0
+ dest_params["use_direct_reads"] = 0
+ # DeleteRange is not currently compatible with Txns
+ if dest_params.get("test_batches_snapshots") == 1 or \
+ dest_params.get("use_txn") == 1:
+ dest_params["delpercent"] += dest_params["delrangepercent"]  # fold the range-delete share into plain deletes
+ dest_params["delrangepercent"] = 0
+ # Only under WritePrepared txns, unordered_write would provide the same guarantees as vanilla rocksdb
+ if dest_params.get("unordered_write", 0) == 1:
+ dest_params["txn_write_policy"] = 1
+ dest_params["allow_concurrent_memtable_write"] = 1
+ if dest_params.get("disable_wal", 0) == 1:
+ dest_params["atomic_flush"] = 1
+ dest_params["sync"] = 0
+ if dest_params.get("open_files", 1) != -1:  # a missing key counts as "not -1"
+ # Compaction TTL and periodic compactions are only compatible
+ # with open_files = -1
+ dest_params["compaction_ttl"] = 0
+ dest_params["periodic_compaction_seconds"] = 0
+ if dest_params.get("compaction_style", 0) == 2:
+ # Disable compaction TTL in FIFO compaction, because right
+ # now assertion failures are triggered.
+ dest_params["compaction_ttl"] = 0
+ dest_params["periodic_compaction_seconds"] = 0
+ if dest_params["partition_filters"] == 1:
+ if dest_params["index_type"] != 2:  # partitioned filters are kept only together with index_type 2
+ dest_params["partition_filters"] = 0
+ else:
+ dest_params["use_block_based_filter"] = 0
+ if dest_params.get("atomic_flush", 0) == 1:
+ # disable pipelined write when atomic flush is used.
+ dest_params["enable_pipelined_write"] = 0
+ return dest_params
+
+def gen_cmd_params(args):  # merges the parameter tables in priority order (later update() wins) and finally the explicit command-line values
+ params = {}
+
+ params.update(default_params)
+ if args.test_type == 'blackbox':
+ params.update(blackbox_default_params)
+ if args.test_type == 'whitebox':
+ params.update(whitebox_default_params)
+ if args.simple:
+ params.update(simple_default_params)
+ if args.test_type == 'blackbox':
+ params.update(blackbox_simple_default_params)
+ if args.test_type == 'whitebox':
+ params.update(whitebox_simple_default_params)
+ if args.cf_consistency:
+ params.update(cf_consistency_params)
+ if args.txn:
+ params.update(txn_params)
+
+ for k, v in vars(args).items():  # every argparse attribute that was actually given (not None) overrides the tables
+ if v is not None:
+ params[k] = v
+ return params
+
+
+def gen_cmd(params, unknown_params):
+ finalzied_params = finalize_and_sanitize(params)
+ cmd = ['./db_stress'] + [
+ '--{0}={1}'.format(k, v)
+ for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)]
+ if k not in set(['test_type', 'simple', 'duration', 'interval',
+ 'random_kill_odd', 'cf_consistency', 'txn'])
+ and v is not None] + unknown_params
+ return cmd
+
+
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in RocksDB.
+def blackbox_crash_main(args, unknown_args):  # repeatedly starts db_stress on one DB directory and kills it after 'interval' seconds until 'duration' has elapsed
+ cmd_params = gen_cmd_params(args)
+ dbname = get_dbname('blackbox')
+ exit_time = time.time() + cmd_params['duration']
+
+ print("Running blackbox-crash-test with \n"
+ + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
+ + "total-duration=" + str(cmd_params['duration']) + "\n")
+
+ while time.time() < exit_time:
+ run_had_errors = False
+ killtime = time.time() + cmd_params['interval']
+
+ cmd = gen_cmd(dict(  # lambdas in cmd_params are re-evaluated by gen_cmd on every iteration
+ cmd_params.items() +
+ {'db': dbname}.items()), unknown_args)
+
+ child = subprocess.Popen(cmd, stderr=subprocess.PIPE)  # only stderr is captured; it is read after the run, below
+ print("Running db_stress with pid=%d: %s\n\n"
+ % (child.pid, ' '.join(cmd)))
+
+ stop_early = False
+ while time.time() < killtime:  # poll once per second until the kill deadline
+ if child.poll() is not None:
+ print("WARNING: db_stress ended before kill: exitcode=%d\n"
+ % child.returncode)
+ stop_early = True
+ break
+ time.sleep(1)
+
+ if not stop_early:
+ if child.poll() is not None:  # the child may have exited between the last poll and the deadline
+ print("WARNING: db_stress ended before kill: exitcode=%d\n"
+ % child.returncode)
+ else:
+ child.kill()
+ print("KILLED %d\n" % child.pid)
+ time.sleep(1) # time to stabilize after a kill
+
+ while True:
+ line = child.stderr.readline().strip()
+ if line == '':  # stops at the first empty (or whitespace-only) line as well as at end of stream
+ break
+ elif not line.startswith('WARNING'):  # any stderr line that is not a WARNING marks the run as failed
+ run_had_errors = True
+ print('stderr has error message:')
+ print('***' + line + '***')
+
+ if run_had_errors:
+ sys.exit(2)
+
+ time.sleep(1) # time to stabilize before the next run
+
+ # we need to clean up after ourselves -- only do this on test success
+ shutil.rmtree(dbname, True)
+
+
+# This python script runs db_stress multiple times. Some runs with
+# kill_random_test that causes rocksdb to crash at various points in code.
+def whitebox_crash_main(args, unknown_args):  # runs db_stress to completion in a loop, cycling through kill-point runs and normal runs, until 'duration' has elapsed
+ cmd_params = gen_cmd_params(args)
+ dbname = get_dbname('whitebox')
+
+ cur_time = time.time()
+ exit_time = cur_time + cmd_params['duration']
+ half_time = cur_time + cmd_params['duration'] / 2
+
+ print("Running whitebox-crash-test with \n"
+ + "total-duration=" + str(cmd_params['duration']) + "\n")
+
+ total_check_mode = 4
+ check_mode = 0  # 0: kill test, 1: universal compaction, 2: FIFO compaction, 3: plain normal run
+ kill_random_test = cmd_params['random_kill_odd']
+ kill_mode = 0  # which of the three kill variants to use the next time check_mode is 0
+
+ while time.time() < exit_time:
+ if check_mode == 0:
+ additional_opts = {
+ # use large ops per thread since we will kill it anyway
+ "ops_per_thread": 100 * cmd_params['ops_per_thread'],
+ }
+ # run with kill_random_test, with three modes.
+ # Mode 0 covers all kill points. Mode 1 covers less kill points but
+ # increases chance of triggering them. Mode 2 covers even less
+ # frequent kill points and further increases triggering chance.
+ if kill_mode == 0:
+ additional_opts.update({
+ "kill_random_test": kill_random_test,
+ })
+ elif kill_mode == 1:
+ if cmd_params.get('disable_wal', 0) == 1:
+ my_kill_odd = kill_random_test / 50 + 1
+ else:
+ my_kill_odd = kill_random_test / 10 + 1
+ additional_opts.update({
+ "kill_random_test": my_kill_odd,
+ "kill_prefix_blacklist": "WritableFileWriter::Append,"
+ + "WritableFileWriter::WriteBuffered",
+ })
+ elif kill_mode == 2:
+ # TODO: May need to adjust random odds if kill_random_test
+ # is too small.
+ additional_opts.update({
+ "kill_random_test": (kill_random_test / 5000 + 1),
+ "kill_prefix_blacklist": "WritableFileWriter::Append,"
+ "WritableFileWriter::WriteBuffered,"
+ "PosixMmapFile::Allocate,WritableFileWriter::Flush",
+ })
+ # Run kill mode 0, 1 and 2 by turn.
+ kill_mode = (kill_mode + 1) % 3
+ elif check_mode == 1:
+ # normal run with universal compaction mode
+ additional_opts = {
+ "kill_random_test": None,  # None values are dropped by gen_cmd, so no kill flag is passed
+ "ops_per_thread": cmd_params['ops_per_thread'],
+ "compaction_style": 1,
+ }
+ elif check_mode == 2:
+ # normal run with FIFO compaction mode
+ # ops_per_thread is divided by 5 because FIFO compaction
+ # style is quite a bit slower on reads with lot of files
+ additional_opts = {
+ "kill_random_test": None,
+ "ops_per_thread": cmd_params['ops_per_thread'] / 5,
+ "compaction_style": 2,
+ }
+ else:
+ # normal run
+ additional_opts = {
+ "kill_random_test": None,
+ "ops_per_thread": cmd_params['ops_per_thread'],
+ }
+
+ cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()  # list concatenation of items(): later pairs override earlier ones
+ + {'db': dbname}.items()), unknown_args)
+
+ print "Running:" + ' '.join(cmd) + "\n" # noqa: E999 T25377293 Grandfathered in
+
+ popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT)  # stderr is merged into stdout, so stderrdata below is always None
+ stdoutdata, stderrdata = popen.communicate()
+ retncode = popen.returncode
+ msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+ check_mode, additional_opts['kill_random_test'], retncode))
+ print msg
+ print stdoutdata
+
+ expected = False
+ if additional_opts['kill_random_test'] is None and (retncode == 0):
+ # we expect zero retncode if no kill option
+ expected = True
+ elif additional_opts['kill_random_test'] is not None and retncode <= 0:
+ # When kill option is given, the test MIGHT kill itself.
+ # If it does, negative retncode is expected. Otherwise 0.
+ expected = True
+
+ if not expected:
+ print "TEST FAILED. See kill option and exit code above!!!\n"
+ sys.exit(1)
+
+ stdoutdata = stdoutdata.lower()
+ errorcount = (stdoutdata.count('error') -  # occurrences of 'got errors 0 times' are subtracted from the 'error' count
+ stdoutdata.count('got errors 0 times'))
+ print "#times error occurred in output is " + str(errorcount) + "\n"
+
+ if (errorcount > 0):
+ print "TEST FAILED. Output has 'error'!!!\n"
+ sys.exit(2)
+ if (stdoutdata.find('fail') >= 0):
+ print "TEST FAILED. Output has 'fail'!!!\n"
+ sys.exit(2)
+
+ # First half of the duration, keep doing kill test. For the next half,
+ # try different modes.
+ if time.time() > half_time:
+ # we need to clean up after ourselves -- only do this on test
+ # success
+ shutil.rmtree(dbname, True)
+ os.mkdir(dbname)
+ cmd_params.pop('expected_values_path', None)  # the expected-values file is no longer passed once the DB has been recreated
+ check_mode = (check_mode + 1) % total_check_mode
+
+ time.sleep(1) # time to stabilize after a kill
+
+
+def main():  # parses the command line and dispatches to the blackbox or whitebox driver
+ parser = argparse.ArgumentParser(description="This script runs and kills \
+ db_stress multiple times")
+ parser.add_argument("test_type", choices=["blackbox", "whitebox"])
+ parser.add_argument("--simple", action="store_true")
+ parser.add_argument("--cf_consistency", action='store_true')
+ parser.add_argument("--txn", action='store_true')
+
+ all_params = dict(default_params.items()  # union of the listed tables; cf_consistency_params and txn_params are not included here
+ + blackbox_default_params.items()
+ + whitebox_default_params.items()
+ + simple_default_params.items()
+ + blackbox_simple_default_params.items()
+ + whitebox_simple_default_params.items())
+
+ for k, v in all_params.items():
+ parser.add_argument("--" + k, type=type(v() if callable(v) else v))  # one optional flag per key; its argparse type is the Python type of the (sampled) default
+ # unknown_args are passed directly to db_stress
+ args, unknown_args = parser.parse_known_args()
+
+ test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+ if test_tmpdir is not None and not os.path.isdir(test_tmpdir):  # fail fast when TEST_TMPDIR is set but is not an existing directory
+ print('%s env var is set to a non-existent directory: %s' %
+ (_TEST_DIR_ENV_VAR, test_tmpdir))
+ sys.exit(1)
+
+ if args.test_type == 'blackbox':
+ blackbox_crash_main(args, unknown_args)
+ if args.test_type == 'whitebox':
+ whitebox_crash_main(args, unknown_args)
+
+if __name__ == '__main__':
+ main()
diff --git a/src/rocksdb/tools/db_repl_stress.cc b/src/rocksdb/tools/db_repl_stress.cc
new file mode 100644
index 000000000..717f5d3d8
--- /dev/null
+++ b/src/rocksdb/tools/db_repl_stress.cc
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+// Built without gflags: explain why the tool cannot run and report failure.
+int main() {
+  fputs("Please install gflags to run rocksdb tools\n", stderr);
+  return 1;
+}
+#else
+
+#include <atomic>
+#include <cstdio>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+// Run a thread to perform Put's.
+// Another thread uses GetUpdatesSince API to keep getting the updates.
+// options :
+// --num_inserts = the num of inserts the first thread should perform.
+// --wal_ttl_seconds = the WAL ttl for the run (in seconds).
+// --wal_size_limit_MB = the WAL size limit for the run (in MB).
+
+using namespace ROCKSDB_NAMESPACE;
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+struct DataPumpThread {  // Arguments handed to DataPumpThreadBody.
+ size_t no_records;  // Number of Put() calls the writer thread issues.
+ DB* db; // Must already be open; this struct never deletes it.
+};
+
+// Convenience wrapper: returns by value whatever
+// test::RandomString(rnd, len, ...) writes into its output argument.
+static std::string RandomString(Random* rnd, int len) {
+  std::string generated;
+  test::RandomString(rnd, len, &generated);
+  return generated;
+}
+
+// Writer thread entry point. `arg` is a DataPumpThread*; issues
+// `no_records` Puts whose key and value both come from RandomString(.., 500),
+// and terminates the process with status 1 on the first failed Put.
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* pump = reinterpret_cast<DataPumpThread*>(arg);
+  Random rnd(301);
+  for (size_t written = 0; written < pump->no_records; ++written) {
+    const Status put_status =
+        pump->db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
+                      Slice(RandomString(&rnd, 500)));
+    if (put_status.ok()) {
+      continue;
+    }
+    fprintf(stderr, "Error in put\n");
+    exit(1);
+  }
+}
+
+struct ReplicationThread {  // State shared by main() and ReplicationThreadBody.
+ std::atomic<bool> stop;  // Set by main() to ask the reader thread to return.
+ DB* db;  // Database whose updates the reader thread tails.
+ volatile size_t no_read;  // Batches read so far. NOTE(review): volatile does not synchronize threads; consider std::atomic<size_t>.
+};
+
+// Reader thread entry point. `arg` is a ReplicationThread*.
+//
+// Repeatedly obtains a TransactionLogIterator starting at the next expected
+// sequence number and walks it, bumping t->no_read once per batch. Terminates
+// the process with status 1 if a batch's sequence number is not the expected
+// one; returns once t->stop has been set.
+static void ReplicationThreadBody(void* arg) {
+  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
+  DB* db = t->db;
+  std::unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  while (!t->stop.load(std::memory_order_acquire)) {
+    iter.reset();
+    // Retry until an iterator can be obtained, bailing out if asked to stop.
+    while (!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      if (t->stop.load(std::memory_order_acquire)) {
+        return;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    for (; iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      if (res.sequence != currentSeqNum) {
+        // Sequence numbers are unsigned 64-bit values; `long` is only 32 bits
+        // on LLP64 platforms, so print through unsigned long long instead.
+        fprintf(stderr, "Missed a seq no. b/w %llu and %llu\n",
+                static_cast<unsigned long long>(currentSeqNum),
+                static_cast<unsigned long long>(res.sequence));
+        exit(1);
+      }
+    }
+  }
+}
+
+DEFINE_uint64(num_inserts, 1000,  // main() also waits until this many batches were read back.
+ "the num of inserts the first thread should"
+ " perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)");  // Copied into Options::WAL_ttl_seconds.
+DEFINE_uint64(wal_size_limit_MB, 10,  // Copied into Options::WAL_size_limit_MB.
+ "the wal size limit for the run"
+ "(in MB)");
+
+// Replication stress driver.
+//
+// Recreates a database under the Env test directory, starts one writer thread
+// (DataPumpThreadBody) and one reader thread (ReplicationThreadBody), waits
+// until the reader has consumed at least --num_inserts batches, then stops
+// the reader. Exits with 0 on success and 1 on any failure.
+int main(int argc, const char** argv) {
+  SetUsageMessage(
+      std::string("\nUSAGE:\n") + std::string(argv[0]) +
+      " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+      " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  Status s = env->GetTestDirectory(&default_db_path);
+  if (!s.ok()) {
+    fprintf(stderr, "Could not get test directory due to %s\n",
+            s.ToString().c_str());
+    exit(1);
+  }
+  // The test directory is not promised to end in a separator; without one the
+  // database would be created next to the test directory instead of in it.
+  default_db_path += "/db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  // Best effort: start from a clean slate; a missing database is fine.
+  DestroyDB(default_db_path, options);
+
+  s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.no_records = FLAGS_num_inserts;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  ReplicationThread replThread;
+  replThread.db = db;
+  replThread.no_read = 0;
+  replThread.stop.store(false, std::memory_order_release);
+
+  env->StartThread(ReplicationThreadBody, &replThread);
+  while (replThread.no_read < FLAGS_num_inserts)
+    ;
+  replThread.stop.store(true, std::memory_order_release);
+  // Let both workers finish before reading their counters and tearing the
+  // database down; exiting while they still use `db` is a race.
+  env->WaitForJoin();
+  if (replThread.no_read < dataPump.no_records) {
+    // no. read should be => than inserted.
+    fprintf(stderr,
+            "No. of Record's written and read not same\nRead : %" ROCKSDB_PRIszt
+            " Written : %" ROCKSDB_PRIszt "\n",
+            replThread.no_read, dataPump.no_records);
+    exit(1);
+  }
+  delete db;
+  fprintf(stderr, "Successful!\n");
+  exit(0);
+}
+
+#endif // GFLAGS
+
+#else // ROCKSDB_LITE
+#include <stdio.h>
+// ROCKSDB_LITE builds do not provide this tool: say so and report failure.
+int main(int, char**) {
+  fputs("Not supported in lite mode.\n", stderr);
+  return 1;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/db_sanity_test.cc b/src/rocksdb/tools/db_sanity_test.cc
new file mode 100644
index 000000000..b483ee84f
--- /dev/null
+++ b/src/rocksdb/tools/db_sanity_test.cc
@@ -0,0 +1,297 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <memory>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/filter_policy.h"
+#include "port/port.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for one sanity scenario: a named Options configuration that can
+// populate a database (Create) and later reopen and check it (Verify).
+//
+// The database lives at `path + Name()`. Subclasses supply Name() and
+// GetOptions().
+class SanityTest {
+ public:
+  explicit SanityTest(const std::string& path)
+      : env_(Env::Default()), path_(path) {
+    // The result is not checked here; presumably a real problem with the
+    // directory surfaces when DB::Open fails in Create()/Verify().
+    env_->CreateDirIfMissing(path);
+  }
+  virtual ~SanityTest() {}
+
+  virtual std::string Name() const = 0;
+  virtual Options GetOptions() const = 0;
+
+  // Destroys any existing database for this scenario, recreates it, writes
+  // kNumKeys key/value pairs and flushes. Returns the first non-OK status.
+  Status Create() {
+    Options options = GetOptions();
+    options.create_if_missing = true;
+    const std::string dbname = DbName();
+    DestroyDB(dbname, options);
+    DB* db = nullptr;
+    Status s = DB::Open(options, dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < kNumKeys; ++i) {
+      const std::string k = KeyFor(i);
+      const std::string v = ValueFor(i);
+      s = db->Put(WriteOptions(), Slice(k), Slice(v));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return db->Flush(FlushOptions());
+  }
+
+  // Opens the scenario's database and checks that every key written by
+  // Create() maps to its expected value. Returns Corruption on a mismatch or
+  // the first non-OK status from Open/Get.
+  Status Verify() {
+    DB* db = nullptr;
+    Status s = DB::Open(GetOptions(), DbName(), &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < kNumKeys; ++i) {
+      const std::string k = KeyFor(i);
+      std::string result;
+      s = db->Get(ReadOptions(), Slice(k), &result);
+      if (!s.ok()) {
+        return s;
+      }
+      if (result != ValueFor(i)) {
+        return Status::Corruption("Unexpected value for key " + k);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Create() and Verify() must agree on the data set, so its size and the
+  // key/value naming live in exactly one place.
+  static const int kNumKeys = 1000000;
+  static std::string KeyFor(int i) { return "key" + ToString(i); }
+  static std::string ValueFor(int i) { return "value" + ToString(i); }
+  std::string DbName() const { return path_ + Name(); }
+
+  Env* env_;
+  std::string const path_;
+};
+
+// Scenario that uses default Options with create_if_missing switched on.
+class SanityTestBasic : public SanityTest {
+ public:
+  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
+
+  std::string Name() const override { return "Basic"; }
+
+  Options GetOptions() const override {
+    Options basic_options;
+    basic_options.create_if_missing = true;
+    return basic_options;
+  }
+};
+
+// Scenario that opens the database with a custom-named comparator which
+// delegates every operation to BytewiseComparator().
+class SanityTestSpecialComparator : public SanityTest {
+ public:
+  explicit SanityTestSpecialComparator(const std::string& path)
+      : SanityTest(path) {
+    options_.comparator = &comparator_;
+  }
+  // options_ points at this object's own comparator_, so copies would alias
+  // it; copying is therefore disabled.
+  SanityTestSpecialComparator(const SanityTestSpecialComparator&) = delete;
+  SanityTestSpecialComparator& operator=(const SanityTestSpecialComparator&) =
+      delete;
+
+  Options GetOptions() const override { return options_; }
+  std::string Name() const override { return "SpecialComparator"; }
+
+ private:
+  class NewComparator : public Comparator {
+   public:
+    const char* Name() const override { return "rocksdb.NewComparator"; }
+    int Compare(const Slice& a, const Slice& b) const override {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    void FindShortestSeparator(std::string* s,
+                               const Slice& l) const override {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    void FindShortSuccessor(std::string* key) const override {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+
+  // Held by value instead of new/delete: no manual cleanup is needed.
+  // Declared before options_ so it is destroyed after it.
+  NewComparator comparator_;
+  Options options_;
+};
+
+// Scenario with Options::compression set to kZlibCompression.
+class SanityTestZlibCompression : public SanityTest {
+ public:
+  explicit SanityTestZlibCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+  }
+
+  std::string Name() const override { return "ZlibCompression"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+// Scenario with kZlibCompression and a block-based table factory; on
+// RocksDB 3.10 or newer the table format_version is set to 2.
+class SanityTestZlibCompressionVersion2 : public SanityTest {
+ public:
+  explicit SanityTestZlibCompressionVersion2(const std::string& path)
+      : SanityTest(path) {
+    BlockBasedTableOptions block_opts;
+#if ROCKSDB_MAJOR > 3 || (ROCKSDB_MAJOR == 3 && ROCKSDB_MINOR >= 10)
+    block_opts.format_version = 2;
+#endif
+    options_.compression = kZlibCompression;
+    options_.table_factory.reset(NewBlockBasedTableFactory(block_opts));
+  }
+
+  std::string Name() const override { return "ZlibCompressionVersion2"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+// Scenario with Options::compression set to kLZ4Compression.
+class SanityTestLZ4Compression : public SanityTest {
+ public:
+  explicit SanityTestLZ4Compression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4Compression;
+  }
+
+  std::string Name() const override { return "LZ4Compression"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+// Scenario with Options::compression set to kLZ4HCCompression.
+class SanityTestLZ4HCCompression : public SanityTest {
+ public:
+  explicit SanityTestLZ4HCCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4HCCompression;
+  }
+
+  std::string Name() const override { return "LZ4HCCompression"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+// Scenario with Options::compression set to kZSTD.
+class SanityTestZSTDCompression : public SanityTest {
+ public:
+  explicit SanityTestZSTDCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZSTD;
+  }
+
+  std::string Name() const override { return "ZSTDCompression"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+#ifndef ROCKSDB_LITE
+// Scenario using the plain table factory with a 2-byte fixed prefix
+// extractor and mmap reads enabled.
+class SanityTestPlainTableFactory : public SanityTest {
+ public:
+  explicit SanityTestPlainTableFactory(const std::string& path)
+      : SanityTest(path) {
+    options_.allow_mmap_reads = true;
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    options_.table_factory.reset(NewPlainTableFactory());
+  }
+
+  std::string Name() const override { return "PlainTable"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+#endif // ROCKSDB_LITE
+
+// Scenario using a block-based table whose filter policy is
+// NewBloomFilterPolicy(10).
+class SanityTestBloomFilter : public SanityTest {
+ public:
+  explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) {
+    BlockBasedTableOptions block_opts;
+    block_opts.filter_policy.reset(NewBloomFilterPolicy(10));
+    options_.table_factory.reset(NewBlockBasedTableFactory(block_opts));
+  }
+
+  std::string Name() const override { return "BloomFilter"; }
+  Options GetOptions() const override { return options_; }
+
+ private:
+  Options options_;
+};
+
+namespace {
+// Runs `command` ("create" or "verify") for every scenario rooted at `path`,
+// logging each scenario's name and resulting status to stderr. Returns true
+// only when every scenario reported an OK status.
+bool RunSanityTests(const std::string& command, const std::string& path) {
+  bool all_passed = true;
+// Suppress false positive clang static analyzer warnings.
+#ifndef __clang_analyzer__
+  std::vector<SanityTest*> scenarios = {
+      new SanityTestBasic(path),
+      new SanityTestSpecialComparator(path),
+      new SanityTestZlibCompression(path),
+      new SanityTestZlibCompressionVersion2(path),
+      new SanityTestLZ4Compression(path),
+      new SanityTestLZ4HCCompression(path),
+      new SanityTestZSTDCompression(path),
+#ifndef ROCKSDB_LITE
+      new SanityTestPlainTableFactory(path),
+#endif  // ROCKSDB_LITE
+      new SanityTestBloomFilter(path)};
+
+  const bool creating = (command == "create");
+  fputs(creating ? "Creating...\n" : "Verifying...\n", stderr);
+
+  for (SanityTest* scenario : scenarios) {
+    fprintf(stderr, "%s -- ", scenario->Name().c_str());
+    Status s;
+    if (creating) {
+      s = scenario->Create();
+    } else {
+      assert(command == "verify");
+      s = scenario->Verify();
+    }
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    if (!s.ok()) {
+      fprintf(stderr, "FAIL\n");
+      all_passed = false;
+    }
+
+    // Each scenario is owned by this function and released once it has run.
+    delete scenario;
+  }
+#endif  // __clang_analyzer__
+  return all_passed;
+}
+} // namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Usage: <program> <path> [create|verify]
+//
+// Validates the command line, makes sure <path> ends with '/', and runs all
+// sanity scenarios. Returns 0 if every scenario succeeded, 1 otherwise; exits
+// with 1 after printing a usage message when the arguments are invalid.
+int main(int argc, char** argv) {
+  std::string path, command;
+  bool ok = (argc == 3);
+  if (ok) {
+    path = std::string(argv[1]);
+    command = std::string(argv[2]);
+    // An empty path is rejected up front: calling back() on an empty string
+    // below would be undefined behavior.
+    ok = !path.empty() && (command == "create" || command == "verify");
+  }
+  if (!ok) {
+    fprintf(stderr, "Usage: %s <path> [create|verify] \n", argv[0]);
+    exit(1);
+  }
+  if (path.back() != '/') {
+    path += "/";
+  }
+
+  bool sanity_ok = ROCKSDB_NAMESPACE::RunSanityTests(command, path);
+
+  return sanity_ok ? 0 : 1;
+}
diff --git a/src/rocksdb/tools/dbench_monitor b/src/rocksdb/tools/dbench_monitor
new file mode 100755
index 000000000..d85f9d070
--- /dev/null
+++ b/src/rocksdb/tools/dbench_monitor
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+#
+#(c) 2004-present, Facebook Inc. All rights reserved.
+#
+#see LICENSE file for more information on use/redistribution rights.
+#
+
+#
+#dbench_monitor: monitor db_bench process for violation of memory utilization
+#
+#default usage will monitor 'virtual memory size'. See below for standard options
+#passed to db_bench during this test.
+#
+# See also: ./pflag for the actual monitoring script that does the work
+#
+#NOTE:
+# You may end up with some /tmp/ files if db_bench OR
+# this script OR ./pflag was killed unceremoniously
+#
+# If you see the script taking a long time, trying "kill"
+# will usually cleanly exit.
+#
+#
+DIR=`dirname $0`
+LOG=/tmp/`basename $0`.$$
+DB_BENCH="$DIR/../db_bench";
+PFLAG=${DIR}/pflag
+
+usage() {  # Prints the help text below and then exits the script.
+ cat <<HELP; exit
+
+Usage: $0 [-h]
+
+-h: prints this help message
+
+This program will run the db_bench script to monitor memory usage
+using the 'pflag' program. It launches db_bench with default settings
+for certain arguments. You can change the defaults passed to
+'db_bench' program, by setting the following environment
+variables:
+
+ bs [block_size]
+ ztype [compression_type]
+ benches [benchmarks]
+ reads [reads]
+ threads [threads]
+ cs [cache_size]
+ vsize [value_size]
+ comp [compression_ratio]
+ num [num]
+
+See the code for more info
+
+HELP
+
+}
+
+[ ! -x ${DB_BENCH} ] && echo "WARNING: ${DB_BENCH} doesn't exist, abort!" && exit -1;
+
+[ "x$1" = "x-h" ] && usage;
+
+trap 'rm -f ${LOG}; kill ${PID}; echo "Interrupted, exiting";' 1 2 3 15
+
+touch $LOG;
+
+: ${bs:=16384}
+: ${ztype:=zlib}
+: ${benches:=readwhilewriting}
+: ${reads:=$((1*1024*1024))};
+: ${threads:=8}
+: ${vsize:=2000}
+: ${comp:=0.5}
+: ${num:=10000}
+: ${cs:=$((1*1024*1024*1024))};
+
+DEBUG=1 #Set to 0 to remove chattiness
+
+
+if [ "x$DEBUG" != "x" ]; then
+ #
+ #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
+ #cleanup the dir and re-run
+ #
+ echo DEBUG: Will run $DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db
+
+fi
+
+$DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db >$LOG 2>&1 &
+
+if [ $? -ne 0 ]; then
+ warn "WARNING: ${DB_BENCH} did not launch successfully! Abort!";
+ exit;
+fi
+PID=$!
+
+#
+#Start the monitoring. Default is "vsz" monitoring for upto cache_size ($cs) value of virtual mem
+#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
+#
+${PFLAG} -p $PID -v
+
+rm -f $LOG;
diff --git a/src/rocksdb/tools/dump/db_dump_tool.cc b/src/rocksdb/tools/dump/db_dump_tool.cc
new file mode 100644
index 000000000..be3ff7962
--- /dev/null
+++ b/src/rocksdb/tools/dump/db_dump_tool.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <iostream>
+
+#include "rocksdb/db.h"
+#include "rocksdb/db_dump_tool.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Dumps every key/value pair of the database at dump_options.db_path into the
+// file dump_options.dump_location.
+//
+// File layout, as written below: 8-byte magic "ROCKDUMP", 8-byte version,
+// then a size-prefixed JSON info blob, then for each entry a size-prefixed
+// key followed by a size-prefixed value. Every size is a 4-byte fixed32.
+// The info blob is "{}" when dump_options.anonymous is set.
+//
+// Returns true on success. On failure a message is printed to std::cerr and
+// false is returned.
+bool DbDumpTool::Run(const DumpOptions& dump_options,
+                     ROCKSDB_NAMESPACE::Options options) {
+  ROCKSDB_NAMESPACE::DB* dbptr;
+  ROCKSDB_NAMESPACE::Status status;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> dumpfile;
+  char hostname[1024];
+  int64_t timesec = 0;
+  std::string abspath;
+  char json[4096];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
+
+  ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
+
+  // Open the database
+  options.create_if_missing = false;
+  status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path,
+                                                  &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << dump_options.db_path
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
+
+  status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
+                                ROCKSDB_NAMESPACE::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << dump_options.dump_location
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  // Appends `data` to the dump file, reporting any failure on std::cerr.
+  auto append = [&dumpfile](const ROCKSDB_NAMESPACE::Slice& data) -> bool {
+    ROCKSDB_NAMESPACE::Status st = dumpfile->Append(data);
+    if (!st.ok()) {
+      std::cerr << "Append failed: " << st.ToString() << std::endl;
+      return false;
+    }
+    return true;
+  };
+  // Appends a 4-byte fixed32 length followed by the bytes of `data`.
+  auto append_sized = [&append](const ROCKSDB_NAMESPACE::Slice& data) -> bool {
+    char size[4];
+    ROCKSDB_NAMESPACE::EncodeFixed32(size, (uint32_t)data.size());
+    return append(ROCKSDB_NAMESPACE::Slice(size, 4)) && append(data);
+  };
+
+  if (!append(ROCKSDB_NAMESPACE::Slice(magicstr, 8)) ||
+      !append(ROCKSDB_NAMESPACE::Slice(versionstr, 8))) {
+    return false;
+  }
+
+  if (dump_options.anonymous) {
+    snprintf(json, sizeof(json), "{}");
+  } else {
+    // The metadata is informational only, so a failed lookup falls back to a
+    // neutral value instead of leaving the buffer uninitialized.
+    if (!env->GetHostName(hostname, sizeof(hostname)).ok()) {
+      hostname[0] = '\0';
+    }
+    hostname[sizeof(hostname) - 1] = '\0';
+    if (!env->GetCurrentTime(&timesec).ok()) {
+      timesec = 0;
+    }
+    if (!env->GetAbsolutePath(dump_options.db_path, &abspath).ok()) {
+      abspath = dump_options.db_path;
+    }
+    snprintf(json, sizeof(json),
+             "{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
+             "\"creation-time\": %" PRIi64 " }",
+             abspath.c_str(), hostname, timesec);
+  }
+
+  if (!append_sized(ROCKSDB_NAMESPACE::Slice(json, strlen(json)))) {
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
+      db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    if (!append_sized(it->key()) || !append_sized(it->value())) {
+      return false;
+    }
+  }
+  if (!it->status().ok()) {
+    // Report the iterator's own status, not the last file-operation status.
+    std::cerr << "Database iteration failed: " << it->status().ToString()
+              << std::endl;
+    return false;
+  }
+
+  // Close explicitly so that an error surfacing at close time is reported
+  // instead of being lost in the destructor.
+  status = dumpfile->Close();
+  if (!status.ok()) {
+    std::cerr << "Unable to close dump file '" << dump_options.dump_location
+              << "': " << status.ToString() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Loads a dump file written by DbDumpTool into the database at
+// undump_options.db_path, creating the database if necessary.
+//
+// Checks the 8-byte magic and 8-byte version, skips the size-prefixed info
+// blob, then Puts every size-prefixed key/value record. When
+// undump_options.compact_db is set the whole key range is compacted after
+// loading.
+//
+// Returns true on success. On failure a message is printed to stderr and
+// false is returned. A read error or a record cut short at any point counts
+// as a failure; only end-of-file exactly at a record boundary ends the load
+// successfully.
+bool DbUndumpTool::Run(const UndumpOptions& undump_options,
+                       ROCKSDB_NAMESPACE::Options options) {
+  ROCKSDB_NAMESPACE::DB* dbptr;
+  ROCKSDB_NAMESPACE::Status status;
+  ROCKSDB_NAMESPACE::Env* env;
+  std::unique_ptr<ROCKSDB_NAMESPACE::SequentialFile> dumpfile;
+  ROCKSDB_NAMESPACE::Slice slice;
+  char scratch8[8];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
+
+  env = ROCKSDB_NAMESPACE::Env::Default();
+
+  status = env->NewSequentialFile(undump_options.dump_location, &dumpfile,
+                                  ROCKSDB_NAMESPACE::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << undump_options.dump_location
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), magicstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' is not a recognizable dump file." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), versionstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' version not recognized." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(4, &slice, scratch8);
+  if (!status.ok() || slice.size() != 4) {
+    std::cerr << "Unable to read info blob size." << std::endl;
+    return false;
+  }
+  uint32_t infosize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+  status = dumpfile->Skip(infosize);
+  if (!status.ok()) {
+    std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  options.create_if_missing = true;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << undump_options.db_path
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
+
+  // Scratch buffers grow on demand and are reused across records.
+  size_t last_keysize = 64;
+  size_t last_valsize = 1 << 20;
+  std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
+  std::unique_ptr<char[]> valscratch(new char[last_valsize]);
+
+  while (1) {
+    uint32_t keysize, valsize;
+    ROCKSDB_NAMESPACE::Slice keyslice;
+    ROCKSDB_NAMESPACE::Slice valslice;
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok()) {
+      std::cerr << "Unable to read key size: " << status.ToString()
+                << std::endl;
+      return false;
+    }
+    if (slice.size() == 0) {
+      // Nothing left to read at a record boundary: end of the dump.
+      break;
+    }
+    if (slice.size() != 4) {
+      std::cerr << "Unable to read key size: insufficient data" << std::endl;
+      return false;
+    }
+    keysize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+    if (keysize > last_keysize) {
+      // Double the capacity, but clamp to the requested size when doubling
+      // would overshoot, so the arithmetic can never wrap around to 0.
+      while (keysize > last_keysize) {
+        last_keysize =
+            (last_keysize > keysize / 2) ? keysize : last_keysize * 2;
+      }
+      keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
+    }
+
+    status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
+    if (!status.ok() || keyslice.size() != keysize) {
+      std::cerr << "Key read failure: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok() || slice.size() != 4) {
+      std::cerr << "Unable to read value size: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+    valsize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+    if (valsize > last_valsize) {
+      // Same overflow-safe growth as for the key buffer.
+      while (valsize > last_valsize) {
+        last_valsize =
+            (last_valsize > valsize / 2) ? valsize : last_valsize * 2;
+      }
+      valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
+    }
+
+    status = dumpfile->Read(valsize, &valslice, valscratch.get());
+    if (!status.ok() || valslice.size() != valsize) {
+      std::cerr << "Unable to read value: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), keyslice, valslice);
+    if (!status.ok()) {
+      fprintf(stderr, "Unable to write database entry\n");
+      return false;
+    }
+  }
+
+  if (undump_options.compact_db) {
+    status = db->CompactRange(ROCKSDB_NAMESPACE::CompactRangeOptions(), nullptr,
+                              nullptr);
+    if (!status.ok()) {
+      fprintf(stderr,
+              "Unable to compact the database after loading the dumped file\n");
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/dump/rocksdb_dump.cc b/src/rocksdb/tools/dump/rocksdb_dump.cc
new file mode 100644
index 000000000..358457e92
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_dump.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+// Stub used when gflags is missing and/or ROCKSDB_LITE is defined: prints the
+// reason(s) the dump tool is unavailable and reports failure.
+int main() {
+#ifndef GFLAGS
+  fputs("Please install gflags to run rocksdb tools\n", stderr);
+#endif
+#ifdef ROCKSDB_LITE
+  fputs("DbDumpTool is not supported in ROCKSDB_LITE\n", stderr);
+#endif
+  return 1;
+}
+
+#else
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+#include "util/gflags_compat.h"
+
+// Command-line flags. --db_path and --dump_location are mandatory (main()
+// rejects empty values); --db_options is parsed only when non-empty.
+DEFINE_string(db_path, "", "Path to the db that will be dumped");
+DEFINE_string(dump_location, "", "Path where the dump file will be written");
+DEFINE_bool(anonymous, false,
+            "Remove information like db path, creation time from dumped file");
+DEFINE_string(db_options, "",
+              "Options string used to open the database that will be dumped");
+
+// Entry point of the dump tool: parses the flags, builds DumpOptions and the
+// database Options (optionally from --db_options), and runs DbDumpTool.
+// Returns 0 on success and 1 on bad arguments or a failed dump.
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path.empty() || FLAGS_dump_location.empty()) {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  ROCKSDB_NAMESPACE::DumpOptions dump_options;
+  dump_options.db_path = FLAGS_db_path;
+  dump_options.dump_location = FLAGS_dump_location;
+  dump_options.anonymous = FLAGS_anonymous;
+
+  ROCKSDB_NAMESPACE::Options db_options;
+  if (!FLAGS_db_options.empty()) {
+    ROCKSDB_NAMESPACE::Options parsed_options;
+    const ROCKSDB_NAMESPACE::Status parse_status =
+        ROCKSDB_NAMESPACE::GetOptionsFromString(db_options, FLAGS_db_options,
+                                                &parsed_options);
+    if (!parse_status.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  ROCKSDB_NAMESPACE::DbDumpTool tool;
+  return tool.Run(dump_options, db_options) ? 0 : 1;
+}
+#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/tools/dump/rocksdb_undump.cc b/src/rocksdb/tools/dump/rocksdb_undump.cc
new file mode 100644
index 000000000..2ff128548
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_undump.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+// Stub used when gflags is missing and/or ROCKSDB_LITE is defined: prints the
+// reason(s) the undump tool is unavailable and reports failure.
+int main() {
+#ifndef GFLAGS
+  fputs("Please install gflags to run rocksdb tools\n", stderr);
+#endif
+#ifdef ROCKSDB_LITE
+  fputs("DbUndumpTool is not supported in ROCKSDB_LITE\n", stderr);
+#endif
+  return 1;
+}
+
+#else
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+#include "util/gflags_compat.h"
+
+DEFINE_string(dump_location, "", "Path to the dump file that will be loaded");  // Mandatory: main() rejects an empty value.
+DEFINE_string(db_path, "", "Path to the db that we will undump the file into");  // Mandatory: main() rejects an empty value.
+DEFINE_bool(compact, false, "Compact the db after loading the dumped file");  // Forwarded as UndumpOptions::compact_db.
+DEFINE_string(db_options, "",  // Parsed with GetOptionsFromString when non-empty.
+ "Options string used to open the database that will be loaded");
+
+// Entry point of the undump tool: parses the flags, builds UndumpOptions and
+// the database Options (optionally from --db_options), and runs DbUndumpTool.
+// Returns 0 on success and 1 on bad arguments or a failed load.
+int main(int argc, char **argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path.empty() || FLAGS_dump_location.empty()) {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  ROCKSDB_NAMESPACE::UndumpOptions undump_options;
+  undump_options.db_path = FLAGS_db_path;
+  undump_options.dump_location = FLAGS_dump_location;
+  undump_options.compact_db = FLAGS_compact;
+
+  ROCKSDB_NAMESPACE::Options db_options;
+  if (!FLAGS_db_options.empty()) {
+    ROCKSDB_NAMESPACE::Options parsed_options;
+    const ROCKSDB_NAMESPACE::Status parse_status =
+        ROCKSDB_NAMESPACE::GetOptionsFromString(db_options, FLAGS_db_options,
+                                                &parsed_options);
+    if (!parse_status.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  ROCKSDB_NAMESPACE::DbUndumpTool tool;
+  return tool.Run(undump_options, db_options) ? 0 : 1;
+}
+#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/tools/generate_random_db.sh b/src/rocksdb/tools/generate_random_db.sh
new file mode 100755
index 000000000..5b5962617
--- /dev/null
+++ b/src/rocksdb/tools/generate_random_db.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to load some pre-generated data files into a DB using the ldb tool.
+# ./ldb needs to be available in the current directory and executable.
+#
+# Usage: <SCRIPT> <input_data_path> <DB Path>
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <input_data_path> <DB Path>"
+  exit 1
+fi
+
+input_data_dir=$1
+db_dir=$2
+# Quoted so that a path containing whitespace or glob characters cannot make
+# rm -rf remove anything other than the requested DB directory.
+rm -rf -- "$db_dir"
+
+echo "== Loading data from $input_data_dir to $db_dir"
+
+# Compression types are used round-robin, one per input file.
+declare -a compression_opts=("no" "snappy" "zlib" "bzip2")
+
+set -e
+
+n=0
+
+# Glob instead of parsing `ls` so file names containing whitespace survive.
+for path in "$input_data_dir"/*
+do
+  # With no matching files the pattern is left unexpanded; skip it.
+  [ -e "$path" ] || continue
+  f=${path##*/}
+  echo "== Loading $f with compression ${compression_opts[n % 4]}"
+  ./ldb load --db="$db_dir" --compression_type=${compression_opts[n % 4]} --bloom_bits=10 --auto_compaction=false --create_if_missing < "$path"
+  n=$((n + 1))
+done
diff --git a/src/rocksdb/tools/ingest_external_sst.sh b/src/rocksdb/tools/ingest_external_sst.sh
new file mode 100755
index 000000000..8e2fed7ce
--- /dev/null
+++ b/src/rocksdb/tools/ingest_external_sst.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+#
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <DB Path> <External SST Dir>"
+  exit 1
+fi
+
+db_dir=$1
+external_sst_dir=$2
+
+# The -name pattern is quoted so that find receives it verbatim; unquoted, the
+# shell would expand it against files in the current directory first.
+find "$external_sst_dir" -name 'extern_sst*' | while IFS= read -r f
+do
+  echo "== Ingesting external SST file $f to DB at $db_dir"
+  # stdin is redirected so ldb cannot consume the file list fed to the loop.
+  ./ldb --db="$db_dir" --create_if_missing ingest_extern_sst "$f" < /dev/null
+done
diff --git a/src/rocksdb/tools/ldb.cc b/src/rocksdb/tools/ldb.cc
new file mode 100644
index 000000000..482383be8
--- /dev/null
+++ b/src/rocksdb/tools/ldb.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/ldb_tool.h"
+
+// Hands the command line to LDBTool and returns 0 once Run() returns.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::LDBTool().Run(argc, argv);
+  return 0;
+}
+#else
+#include <stdio.h>
+// ROCKSDB_LITE builds do not provide ldb: say so and report failure.
+int main(int, char**) {
+  fputs("Not supported in lite mode.\n", stderr);
+  return 1;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_cmd.cc b/src/rocksdb/tools/ldb_cmd.cc
new file mode 100644
index 000000000..48d32d2ef
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd.cc
@@ -0,0 +1,3437 @@
+
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/ldb_cmd.h"
+
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "port/port_dirent.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "tools/ldb_cmd_impl.h"
+#include "tools/sst_dump_tool_imp.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/file_checksum_helper.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileChecksumFuncCrc32c;
+
+// Names of the command-line options and flags understood by ldb commands.
+// On the command line they are written with a leading "--"
+// (InitFromCmdLineArgs strips that prefix before storing the name).
+const std::string LDBCommand::ARG_ENV_URI = "env_uri";
+const std::string LDBCommand::ARG_DB = "db";
+const std::string LDBCommand::ARG_PATH = "path";
+const std::string LDBCommand::ARG_SECONDARY_PATH = "secondary_path";
+const std::string LDBCommand::ARG_HEX = "hex";
+const std::string LDBCommand::ARG_KEY_HEX = "key_hex";
+const std::string LDBCommand::ARG_VALUE_HEX = "value_hex";
+const std::string LDBCommand::ARG_CF_NAME = "column_family";
+const std::string LDBCommand::ARG_TTL = "ttl";
+const std::string LDBCommand::ARG_TTL_START = "start_time";
+const std::string LDBCommand::ARG_TTL_END = "end_time";
+const std::string LDBCommand::ARG_TIMESTAMP = "timestamp";
+const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options";
+const std::string LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS =
+ "ignore_unknown_options";
+const std::string LDBCommand::ARG_FROM = "from";
+const std::string LDBCommand::ARG_TO = "to";
+const std::string LDBCommand::ARG_MAX_KEYS = "max_keys";
+const std::string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const std::string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len";
+const std::string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
+const std::string LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES =
+ "compression_max_dict_bytes";
+const std::string LDBCommand::ARG_BLOCK_SIZE = "block_size";
+const std::string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const std::string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size";
+const std::string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
+const std::string LDBCommand::ARG_FILE_SIZE = "file_size";
+const std::string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
+const std::string LDBCommand::ARG_NO_VALUE = "no_value";
+
+// Separator between key and value in the textual "key ==> value" form that
+// ParseKeyValue() reads and PrintKeyValue() writes.
+const char* LDBCommand::DELIM = " ==> ";
+
+namespace {
+
+// Forward declarations of file-local helpers; their definitions are not in
+// this part of the file.
+void DumpWalFile(Options options, std::string wal_file, bool print_header,
+ bool print_values, bool is_write_committed,
+ LDBCommandExecuteResult* exec_state);
+
+void DumpSstFile(Options options, std::string filename, bool output_hex,
+ bool show_properties);
+};
+
+// Convenience overload for a main()-style command line: copies
+// argv[1] .. argv[argc - 1] (argv[0] is skipped) into a vector and forwards
+// to the vector-based overload with SelectCommand as the selector.
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    int argc, char** argv, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  std::vector<std::string> cmd_line;
+  int idx = 1;
+  while (idx < argc) {
+    cmd_line.emplace_back(argv[idx]);
+    ++idx;
+  }
+  return InitFromCmdLineArgs(cmd_line, options, ldb_options, column_families,
+                             SelectCommand);
+}
+
+/**
+ * Parse the command-line arguments and create the appropriate LDBCommand
+ * instance.
+ * The command line arguments must be in the following format:
+ * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] ..
+ * COMMAND <PARAM1> <PARAM2> ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] ..
+ * This is similar to the command line format used by HBaseClientTool.
+ * The program name is not included in args.
+ *
+ * Arguments starting with "--" are classified as follows:
+ *   --name=value  -> option_map[name] = value. The value is everything after
+ *                    the first '=', so it may itself contain or end with '='.
+ *   --name        -> flag "name".
+ *   --name=       -> flag "name" (kept for backward compatibility).
+ * Every other argument is a command token: the first one is the command
+ * name, the remaining ones are its parameters.
+ *
+ * The command object is created by `selector`; on success `options` and
+ * `ldb_options` are applied to it.
+ * Returns nullptr if no command was given or the selector does not recognize
+ * it.
+ */
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    const std::vector<std::string>& args, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+    const std::function<LDBCommand*(const ParsedParams&)>& selector) {
+  // Receives the --x=y options (option_map) and the --x flags (flags).
+  ParsedParams parsed_params;
+
+  // Everything other than options and flags. Represents the command and its
+  // parameters, e.g. "put key1 value1".
+  std::vector<std::string> cmdTokens;
+
+  const std::string OPTION_PREFIX = "--";
+
+  for (const auto& arg : args) {
+    if (arg.compare(0, OPTION_PREFIX.size(), OPTION_PREFIX) != 0) {
+      cmdTokens.push_back(arg);
+      continue;
+    }
+
+    // Split on the first '=' only. Splitting on every '=' silently dropped a
+    // trailing '=' from the value (e.g. "--value=YWJj=" became "YWJj").
+    const size_t eq_pos = arg.find('=');
+    const bool is_flag =
+        (eq_pos == std::string::npos) || (eq_pos + 1 == arg.size());
+    // eq_pos >= OPTION_PREFIX.size() here because arg starts with "--".
+    const std::string optionKey = arg.substr(
+        OPTION_PREFIX.size(), eq_pos == std::string::npos
+                                  ? std::string::npos
+                                  : eq_pos - OPTION_PREFIX.size());
+    if (is_flag) {
+      parsed_params.flags.push_back(optionKey);
+    } else {
+      parsed_params.option_map[optionKey] = arg.substr(eq_pos + 1);
+    }
+  }
+
+  if (cmdTokens.empty()) {
+    fprintf(stderr, "Command not specified!\n");
+    return nullptr;
+  }
+
+  parsed_params.cmd = cmdTokens[0];
+  parsed_params.cmd_params.assign(cmdTokens.begin() + 1, cmdTokens.end());
+
+  LDBCommand* command = selector(parsed_params);
+
+  if (command) {
+    command->SetDBOptions(options);
+    command->SetLDBOptions(ldb_options);
+  }
+  return command;
+}
+
+// Creates the command object whose Name() equals parsed_params.cmd, passing
+// it the parsed parameters, options and flags. The caller owns the returned
+// object. Returns nullptr when the command name is not recognized.
+LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
+  const auto& name = parsed_params.cmd;
+  const auto& params = parsed_params.cmd_params;
+  const auto& opts = parsed_params.option_map;
+  const auto& flags = parsed_params.flags;
+
+  if (name == GetCommand::Name()) {
+    return new GetCommand(params, opts, flags);
+  }
+  if (name == PutCommand::Name()) {
+    return new PutCommand(params, opts, flags);
+  }
+  if (name == BatchPutCommand::Name()) {
+    return new BatchPutCommand(params, opts, flags);
+  }
+  if (name == ScanCommand::Name()) {
+    return new ScanCommand(params, opts, flags);
+  }
+  if (name == DeleteCommand::Name()) {
+    return new DeleteCommand(params, opts, flags);
+  }
+  if (name == DeleteRangeCommand::Name()) {
+    return new DeleteRangeCommand(params, opts, flags);
+  }
+  if (name == ApproxSizeCommand::Name()) {
+    return new ApproxSizeCommand(params, opts, flags);
+  }
+  if (name == DBQuerierCommand::Name()) {
+    return new DBQuerierCommand(params, opts, flags);
+  }
+  if (name == CompactorCommand::Name()) {
+    return new CompactorCommand(params, opts, flags);
+  }
+  if (name == WALDumperCommand::Name()) {
+    return new WALDumperCommand(params, opts, flags);
+  }
+  if (name == ReduceDBLevelsCommand::Name()) {
+    return new ReduceDBLevelsCommand(params, opts, flags);
+  }
+  if (name == ChangeCompactionStyleCommand::Name()) {
+    return new ChangeCompactionStyleCommand(params, opts, flags);
+  }
+  if (name == DBDumperCommand::Name()) {
+    return new DBDumperCommand(params, opts, flags);
+  }
+  if (name == DBLoaderCommand::Name()) {
+    return new DBLoaderCommand(params, opts, flags);
+  }
+  if (name == ManifestDumpCommand::Name()) {
+    return new ManifestDumpCommand(params, opts, flags);
+  }
+  if (name == FileChecksumDumpCommand::Name()) {
+    return new FileChecksumDumpCommand(params, opts, flags);
+  }
+  if (name == ListColumnFamiliesCommand::Name()) {
+    return new ListColumnFamiliesCommand(params, opts, flags);
+  }
+  if (name == CreateColumnFamilyCommand::Name()) {
+    return new CreateColumnFamilyCommand(params, opts, flags);
+  }
+  if (name == DropColumnFamilyCommand::Name()) {
+    return new DropColumnFamilyCommand(params, opts, flags);
+  }
+  if (name == DBFileDumperCommand::Name()) {
+    return new DBFileDumperCommand(params, opts, flags);
+  }
+  if (name == InternalDumpCommand::Name()) {
+    return new InternalDumpCommand(params, opts, flags);
+  }
+  if (name == CheckConsistencyCommand::Name()) {
+    return new CheckConsistencyCommand(params, opts, flags);
+  }
+  if (name == CheckPointCommand::Name()) {
+    return new CheckPointCommand(params, opts, flags);
+  }
+  if (name == RepairCommand::Name()) {
+    return new RepairCommand(params, opts, flags);
+  }
+  if (name == BackupCommand::Name()) {
+    return new BackupCommand(params, opts, flags);
+  }
+  if (name == RestoreCommand::Name()) {
+    return new RestoreCommand(params, opts, flags);
+  }
+  if (name == WriteExternalSstFilesCommand::Name()) {
+    return new WriteExternalSstFilesCommand(params, opts, flags);
+  }
+  if (name == IngestExternalSstFilesCommand::Name()) {
+    return new IngestExternalSstFilesCommand(params, opts, flags);
+  }
+  if (name == ListFileRangeDeletesCommand::Name()) {
+    // This command takes no positional parameters.
+    return new ListFileRangeDeletesCommand(opts, flags);
+  }
+  return nullptr;
+}
+
+/* Run the command. Nothing is returned; the outcome is recorded in
+ * exec_state_. */
+void LDBCommand::Run() {
+ // Only run when no result (success or failure) has been recorded yet.
+ if (!exec_state_.IsNotStarted()) {
+ return;
+ }
+
+ // No Env configured, or only the default one: ask Env::LoadEnv() to resolve
+ // env_uri_. A NotFound status is tolerated; any other error fails the
+ // command.
+ if (!options_.env || options_.env == Env::Default()) {
+ Env* env = Env::Default();
+ Status s = Env::LoadEnv(env_uri_, &env, &env_guard_);
+ if (!s.ok() && !s.IsNotFound()) {
+ fprintf(stderr, "LoadEnv: %s\n", s.ToString().c_str());
+ exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+ return;
+ }
+ options_.env = env;
+ }
+
+ // Wrap the chosen Env as the FileSystem used by options_.
+ options_.file_system.reset(new LegacyFileSystemWrapper(options_.env));
+
+ if (db_ == nullptr && !NoDBOpen()) {
+ OpenDB();
+ if (exec_state_.IsFailed() && try_load_options_) {
+ // We don't always return if there is a failure because a WAL file or
+ // manifest file can be given to "dump" command so we should continue.
+ // --try_load_options is not valid in those cases.
+ return;
+ }
+ }
+
+ // We'll intentionally proceed even if the DB can't be opened because users
+ // can also specify a filename, not just a directory.
+ DoCommand();
+
+ // A command that did not record any result is treated as successful.
+ if (exec_state_.IsNotStarted()) {
+ exec_state_ = LDBCommandExecuteResult::Succeed("");
+ }
+
+ if (db_ != nullptr) {
+ CloseDB();
+ }
+}
+
+// Base-class constructor shared by all ldb commands. Stores the raw options,
+// flags and the list of options valid for the concrete command, then derives
+// the common settings (DB path, env URI, column family, secondary path,
+// hex/TTL/timestamp/option-loading switches) from them.
+LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags, bool is_read_only,
+                       const std::vector<std::string>& valid_cmd_line_options)
+    : db_(nullptr),
+      db_ttl_(nullptr),
+      is_read_only_(is_read_only),
+      is_key_hex_(false),
+      is_value_hex_(false),
+      is_db_ttl_(false),
+      timestamp_(false),
+      try_load_options_(false),
+      ignore_unknown_options_(false),
+      create_if_missing_(false),
+      option_map_(options),
+      flags_(flags),
+      valid_cmd_line_options_(valid_cmd_line_options) {
+  // Copies the value of option `name` into *dest when it was supplied and
+  // reports whether it was.
+  const auto fetch = [&options](const std::string& name, std::string* dest) {
+    const auto found = options.find(name);
+    if (found == options.end()) {
+      return false;
+    }
+    *dest = found->second;
+    return true;
+  };
+
+  fetch(ARG_DB, &db_path_);
+  fetch(ARG_ENV_URI, &env_uri_);
+  if (!fetch(ARG_CF_NAME, &column_family_name_)) {
+    // No column family requested: operate on the default one.
+    column_family_name_ = kDefaultColumnFamilyName;
+  }
+  if (!fetch(ARG_SECONDARY_PATH, &secondary_path_)) {
+    secondary_path_ = "";
+  }
+
+  is_key_hex_ = IsKeyHex(options, flags);
+  is_value_hex_ = IsValueHex(options, flags);
+  is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+  timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
+  try_load_options_ = IsFlagPresent(flags, ARG_TRY_LOAD_OPTIONS);
+  ignore_unknown_options_ = IsFlagPresent(flags, ARG_IGNORE_UNKNOWN_OPTIONS);
+}
+
+// Opens the database at db_path_ and stores it in db_ (and db_ttl_ for TTL
+// databases). Depending on the command's settings the DB is opened as a TTL
+// DB, read-only, as a secondary instance, or read-write, with or without an
+// explicit column family list. Failures are reported through exec_state_.
+void LDBCommand::OpenDB() {
+ // With --try_load_options (and when the DB is not being created), start
+ // from the options persisted in the DB's latest options file.
+ if (!create_if_missing_ && try_load_options_) {
+ Status s = LoadLatestOptions(db_path_, options_.env, &options_,
+ &column_families_, ignore_unknown_options_);
+ if (!s.ok() && !s.IsNotFound()) {
+ // Option file exists but load option file error.
+ std::string msg = s.ToString();
+ exec_state_ = LDBCommandExecuteResult::Failed(msg);
+ db_ = nullptr;
+ return;
+ }
+ if (options_.env->FileExists(options_.wal_dir).IsNotFound()) {
+ options_.wal_dir = db_path_;
+ fprintf(
+ stderr,
+ "wal_dir loaded from the option file doesn't exist. Ignore it.\n");
+ }
+
+ // If merge operator is not set, set a string append operator. There is
+ // no harm doing it.
+ for (auto& cf_entry : column_families_) {
+ if (!cf_entry.options.merge_operator) {
+ cf_entry.options.merge_operator =
+ MergeOperators::CreateStringAppendOperator(':');
+ }
+ }
+ }
+ // Apply the command-line overrides; stop if any of them was invalid.
+ options_ = PrepareOptionsForOpenDB();
+ if (!exec_state_.IsNotStarted()) {
+ return;
+ }
+ if (column_families_.empty() && !options_.merge_operator) {
+ // No harm to add a general merge operator if it is not specified.
+ options_.merge_operator = MergeOperators::CreateStringAppendOperator(':');
+ }
+ // Open the DB.
+ Status st;
+ std::vector<ColumnFamilyHandle*> handles_opened;
+ if (is_db_ttl_) {
+ // ldb doesn't yet support TTL DB with multiple column families
+ // NOTE(review): these two checks record a failure in exec_state_ but do
+ // not return, so the DB is still opened below — confirm this is intended.
+ if (!column_family_name_.empty() || !column_families_.empty()) {
+ exec_state_ = LDBCommandExecuteResult::Failed(
+ "ldb doesn't support TTL DB with multiple column families");
+ }
+ if (!secondary_path_.empty()) {
+ exec_state_ = LDBCommandExecuteResult::Failed(
+ "Open as secondary is not supported for TTL DB yet.");
+ }
+ if (is_read_only_) {
+ st = DBWithTTL::Open(options_, db_path_, &db_ttl_, 0, true);
+ } else {
+ st = DBWithTTL::Open(options_, db_path_, &db_ttl_);
+ }
+ db_ = db_ttl_;
+ } else {
+ if (column_families_.empty()) {
+ // Try to figure out column family lists
+ std::vector<std::string> cf_list;
+ st = DB::ListColumnFamilies(options_, db_path_, &cf_list);
+ // It is possible that the DB doesn't exist yet (the "create if
+ // missing" case). The failure is ignored here. We rely on DB::Open()
+ // to give us the correct error message for problem with opening
+ // existing DB.
+ if (st.ok() && cf_list.size() > 1) {
+ // Ignore single column family DB.
+ for (auto cf_name : cf_list) {
+ column_families_.emplace_back(cf_name, options_);
+ }
+ }
+ }
+ // Pick the open mode: read-only, secondary, or read-write; each with or
+ // without an explicit column family list.
+ if (is_read_only_ && secondary_path_.empty()) {
+ if (column_families_.empty()) {
+ st = DB::OpenForReadOnly(options_, db_path_, &db_);
+ } else {
+ st = DB::OpenForReadOnly(options_, db_path_, column_families_,
+ &handles_opened, &db_);
+ }
+ } else {
+ if (column_families_.empty()) {
+ if (secondary_path_.empty()) {
+ st = DB::Open(options_, db_path_, &db_);
+ } else {
+ st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_);
+ }
+ } else {
+ if (secondary_path_.empty()) {
+ st = DB::Open(options_, db_path_, column_families_, &handles_opened,
+ &db_);
+ } else {
+ st = DB::OpenAsSecondary(options_, db_path_, secondary_path_,
+ column_families_, &handles_opened, &db_);
+ }
+ }
+ }
+ }
+ if (!st.ok()) {
+ std::string msg = st.ToString();
+ exec_state_ = LDBCommandExecuteResult::Failed(msg);
+ } else if (!handles_opened.empty()) {
+ // Opened with an explicit column family list: remember each handle by
+ // name and make sure the requested column family is among them.
+ assert(handles_opened.size() == column_families_.size());
+ bool found_cf_name = false;
+ for (size_t i = 0; i < handles_opened.size(); i++) {
+ cf_handles_[column_families_[i].name] = handles_opened[i];
+ if (column_family_name_ == column_families_[i].name) {
+ found_cf_name = true;
+ }
+ }
+ if (!found_cf_name) {
+ exec_state_ = LDBCommandExecuteResult::Failed(
+ "Non-existing column family " + column_family_name_);
+ CloseDB();
+ }
+ } else {
+ // We successfully opened DB in single column family mode.
+ assert(column_families_.empty());
+ if (column_family_name_ != kDefaultColumnFamilyName) {
+ exec_state_ = LDBCommandExecuteResult::Failed(
+ "Non-existing column family " + column_family_name_);
+ CloseDB();
+ }
+ }
+}
+
+// Deletes every column family handle recorded in cf_handles_ and then the DB
+// object itself, leaving db_ null. Does nothing when no DB is open.
+void LDBCommand::CloseDB() {
+  if (db_ == nullptr) {
+    return;
+  }
+  for (auto& pair : cf_handles_) {
+    delete pair.second;
+  }
+  // Forget the deleted handles so that a later GetCfHandle() or OpenDB()
+  // cannot pick up dangling pointers.
+  cf_handles_.clear();
+  delete db_;
+  db_ = nullptr;
+}
+
+// Returns the handle of the column family named column_family_name_.
+// When no handles were recorded (DB opened without an explicit column family
+// list) the DB's default column family is returned. When handles exist but
+// the name is unknown, a failure is recorded in exec_state_ and the default
+// column family is returned.
+ColumnFamilyHandle* LDBCommand::GetCfHandle() {
+  if (cf_handles_.empty()) {
+    return db_->DefaultColumnFamily();
+  }
+  const auto entry = cf_handles_.find(column_family_name_);
+  if (entry != cf_handles_.end()) {
+    return entry->second;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed("Cannot find column family " +
+                                                column_family_name_);
+  return db_->DefaultColumnFamily();
+}
+
+// Returns the options accepted by every command followed by the
+// command-specific ones given in `options`.
+std::vector<std::string> LDBCommand::BuildCmdLineOptions(
+    std::vector<std::string> options) {
+  std::vector<std::string> accepted = {
+      ARG_ENV_URI,          ARG_DB,
+      ARG_SECONDARY_PATH,   ARG_BLOOM_BITS,
+      ARG_BLOCK_SIZE,       ARG_AUTO_COMPACTION,
+      ARG_COMPRESSION_TYPE, ARG_COMPRESSION_MAX_DICT_BYTES,
+      ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE,
+      ARG_FIX_PREFIX_LEN,   ARG_TRY_LOAD_OPTIONS,
+      ARG_IGNORE_UNKNOWN_OPTIONS, ARG_CF_NAME};
+  for (const auto& extra : options) {
+    accepted.push_back(extra);
+  }
+  return accepted;
+}
+
+/**
+ * Looks up `option` in option_map_ (the `options` argument is unused) and
+ * parses its value as a base-10 integer into `value`.
+ * Returns true if the option is present and was parsed.
+ * Returns false if the option is absent or its value cannot be parsed; in
+ * the latter case `exec_state` is set to a failure describing the problem.
+ */
+bool LDBCommand::ParseIntOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, int& value,
+    LDBCommandExecuteResult& exec_state) {
+  const auto found = option_map_.find(option);
+  if (found == option_map_.end()) {
+    return false;
+  }
+
+  const std::string& text = found->second;
+  try {
+#if defined(CYGWIN)
+    value = strtol(text.c_str(), 0, 10);
+#else
+    value = std::stoi(text);
+#endif
+    return true;
+  } catch (const std::invalid_argument&) {
+    exec_state =
+        LDBCommandExecuteResult::Failed(option + " has an invalid value.");
+  } catch (const std::out_of_range&) {
+    exec_state =
+        LDBCommandExecuteResult::Failed(option + " has a value out-of-range.");
+  }
+  return false;
+}
+
+/**
+ * Looks up `option` in option_map_ (the `options` argument is unused) and
+ * copies its value into *value.
+ * Returns true if the option is present; otherwise returns false and leaves
+ * *value untouched.
+ */
+bool LDBCommand::ParseStringOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, std::string* value) {
+  const auto found = option_map_.find(option);
+  const bool present = (found != option_map_.end());
+  if (present) {
+    *value = found->second;
+  }
+  return present;
+}
+
+// Applies the option-related command-line arguments to options_ (and to the
+// options of the selected column family when it is listed in
+// column_families_) and returns a copy of options_. An invalid argument is
+// reported through exec_state_; processing continues, so when several
+// arguments are invalid the last failure recorded is the one that remains.
+Options LDBCommand::PrepareOptionsForOpenDB() {
+ // Column-family-level settings go to the descriptor of the selected column
+ // family if there is one, otherwise to the column family part of options_.
+ ColumnFamilyOptions* cf_opts;
+ auto column_families_iter =
+ std::find_if(column_families_.begin(), column_families_.end(),
+ [this](const ColumnFamilyDescriptor& cf_desc) {
+ return cf_desc.name == column_family_name_;
+ });
+ if (column_families_iter != column_families_.end()) {
+ cf_opts = &column_families_iter->options;
+ } else {
+ cf_opts = static_cast<ColumnFamilyOptions*>(&options_);
+ }
+ DBOptions* db_opts = static_cast<DBOptions*>(&options_);
+ db_opts->create_if_missing = false;
+
+ std::map<std::string, std::string>::const_iterator itr;
+
+ // Block-based table settings; a new table factory is installed only when at
+ // least one of them was given.
+ BlockBasedTableOptions table_options;
+ bool use_table_options = false;
+ int bits;
+ if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
+ if (bits > 0) {
+ use_table_options = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(bits));
+ } else {
+ exec_state_ =
+ LDBCommandExecuteResult::Failed(ARG_BLOOM_BITS + " must be > 0.");
+ }
+ }
+
+ int block_size;
+ if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
+ if (block_size > 0) {
+ use_table_options = true;
+ table_options.block_size = block_size;
+ } else {
+ exec_state_ =
+ LDBCommandExecuteResult::Failed(ARG_BLOCK_SIZE + " must be > 0.");
+ }
+ }
+
+ if (use_table_options) {
+ cf_opts->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+
+ // --auto_compaction=<bool> is stored inverted, as disable_auto_compactions.
+ itr = option_map_.find(ARG_AUTO_COMPACTION);
+ if (itr != option_map_.end()) {
+ cf_opts->disable_auto_compactions = !StringToBool(itr->second);
+ }
+
+ // Map the textual compression name to the corresponding compression type.
+ itr = option_map_.find(ARG_COMPRESSION_TYPE);
+ if (itr != option_map_.end()) {
+ std::string comp = itr->second;
+ if (comp == "no") {
+ cf_opts->compression = kNoCompression;
+ } else if (comp == "snappy") {
+ cf_opts->compression = kSnappyCompression;
+ } else if (comp == "zlib") {
+ cf_opts->compression = kZlibCompression;
+ } else if (comp == "bzip2") {
+ cf_opts->compression = kBZip2Compression;
+ } else if (comp == "lz4") {
+ cf_opts->compression = kLZ4Compression;
+ } else if (comp == "lz4hc") {
+ cf_opts->compression = kLZ4HCCompression;
+ } else if (comp == "xpress") {
+ cf_opts->compression = kXpressCompression;
+ } else if (comp == "zstd") {
+ cf_opts->compression = kZSTD;
+ } else {
+ // Unknown compression.
+ exec_state_ =
+ LDBCommandExecuteResult::Failed("Unknown compression level: " + comp);
+ }
+ }
+
+ int compression_max_dict_bytes;
+ if (ParseIntOption(option_map_, ARG_COMPRESSION_MAX_DICT_BYTES,
+ compression_max_dict_bytes, exec_state_)) {
+ if (compression_max_dict_bytes >= 0) {
+ cf_opts->compression_opts.max_dict_bytes = compression_max_dict_bytes;
+ } else {
+ exec_state_ = LDBCommandExecuteResult::Failed(
+ ARG_COMPRESSION_MAX_DICT_BYTES + " must be >= 0.");
+ }
+ }
+
+ int db_write_buffer_size;
+ if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE,
+ db_write_buffer_size, exec_state_)) {
+ if (db_write_buffer_size >= 0) {
+ db_opts->db_write_buffer_size = db_write_buffer_size;
+ } else {
+ exec_state_ = LDBCommandExecuteResult::Failed(ARG_DB_WRITE_BUFFER_SIZE +
+ " must be >= 0.");
+ }
+ }
+
+ int write_buffer_size;
+ if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size,
+ exec_state_)) {
+ if (write_buffer_size > 0) {
+ cf_opts->write_buffer_size = write_buffer_size;
+ } else {
+ exec_state_ = LDBCommandExecuteResult::Failed(ARG_WRITE_BUFFER_SIZE +
+ " must be > 0.");
+ }
+ }
+
+ int file_size;
+ if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) {
+ if (file_size > 0) {
+ cf_opts->target_file_size_base = file_size;
+ } else {
+ exec_state_ =
+ LDBCommandExecuteResult::Failed(ARG_FILE_SIZE + " must be > 0.");
+ }
+ }
+
+ // Without explicit db_paths, use db_path_ with the largest possible target
+ // size.
+ if (db_opts->db_paths.size() == 0) {
+ db_opts->db_paths.emplace_back(db_path_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ int fix_prefix_len;
+ if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len,
+ exec_state_)) {
+ if (fix_prefix_len > 0) {
+ cf_opts->prefix_extractor.reset(
+ NewFixedPrefixTransform(static_cast<size_t>(fix_prefix_len)));
+ } else {
+ exec_state_ =
+ LDBCommandExecuteResult::Failed(ARG_FIX_PREFIX_LEN + " must be > 0.");
+ }
+ }
+
+ // TODO(ajkr): this return value doesn't reflect the CF options changed, so
+ // subcommands that rely on this won't see the effect of CF-related CLI args.
+ // Such subcommands need to be changed to properly support CFs.
+ return options_;
+}
+
+// Splits `line` at the first occurrence of DELIM into *key and *value,
+// hex-decoding either part when the matching flag is set.
+// Returns false, leaving the outputs untouched, when `line` contains no
+// DELIM.
+bool LDBCommand::ParseKeyValue(const std::string& line, std::string* key,
+                               std::string* value, bool is_key_hex,
+                               bool is_value_hex) {
+  const size_t delim_at = line.find(DELIM);
+  if (delim_at == std::string::npos) {
+    return false;
+  }
+
+  const size_t value_at = delim_at + strlen(DELIM);
+  *key = line.substr(0, delim_at);
+  *value = line.substr(value_at);
+  if (is_key_hex) {
+    *key = HexToString(*key);
+  }
+  if (is_value_hex) {
+    *value = HexToString(*value);
+  }
+  return true;
+}
+
+/**
+ * Checks that every option and flag given on the command line is listed in
+ * valid_cmd_line_options_, and that commands which open a DB were given
+ * either the db or the path option. Extraneous options are usually the
+ * result of user error.
+ * Returns true if all checks pass. Otherwise prints an error message for the
+ * first problem found to stderr and returns false.
+ */
+bool LDBCommand::ValidateCmdLineOptions() {
+  const auto is_known = [this](const std::string& name) {
+    return std::find(valid_cmd_line_options_.begin(),
+                     valid_cmd_line_options_.end(),
+                     name) != valid_cmd_line_options_.end();
+  };
+
+  for (const auto& option : option_map_) {
+    if (!is_known(option.first)) {
+      fprintf(stderr, "Invalid command-line option %s\n",
+              option.first.c_str());
+      return false;
+    }
+  }
+
+  for (const auto& flag : flags_) {
+    if (!is_known(flag)) {
+      fprintf(stderr, "Invalid command-line flag %s\n", flag.c_str());
+      return false;
+    }
+  }
+
+  if (NoDBOpen()) {
+    return true;
+  }
+  const bool has_db = option_map_.find(ARG_DB) != option_map_.end();
+  const bool has_path = option_map_.find(ARG_PATH) != option_map_.end();
+  if (has_db || has_path) {
+    return true;
+  }
+  fprintf(stderr, "Either %s or %s must be specified.\n", ARG_DB.c_str(),
+          ARG_PATH.c_str());
+  return false;
+}
+
+// Decodes a "0x"-prefixed hex string into the bytes it represents.
+// Throws the C string "Invalid hex input" when the prefix is missing (after
+// printing a message to stderr) or when the digits cannot be decoded.
+std::string LDBCommand::HexToString(const std::string& str) {
+  const std::string::size_type len = str.length();
+  const bool has_prefix = len >= 2 && str[0] == '0' && str[1] == 'x';
+  if (!has_prefix) {
+    fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str());
+    throw "Invalid hex input";
+  }
+
+  std::string decoded;
+  const Slice digits(str.data() + 2, len - 2);
+  if (!digits.DecodeHex(&decoded)) {
+    throw "Invalid hex input";
+  }
+  return decoded;
+}
+
+// Returns the hex rendering of `str` (Slice::ToString(true)) prefixed with
+// "0x".
+std::string LDBCommand::StringToHex(const std::string& str) {
+  const std::string hex_digits = Slice(str).ToString(true);
+  return "0x" + hex_digits;
+}
+
+// Formats a key/value pair as "<key><DELIM><value>", rendering either part
+// as a "0x..." hex string when the matching flag is set.
+std::string LDBCommand::PrintKeyValue(const std::string& key,
+                                      const std::string& value, bool is_key_hex,
+                                      bool is_value_hex) {
+  const std::string shown_key = is_key_hex ? StringToHex(key) : key;
+  const std::string shown_value = is_value_hex ? StringToHex(value) : value;
+  return shown_key + DELIM + shown_value;
+}
+
+// Same as above with a single switch controlling both key and value.
+std::string LDBCommand::PrintKeyValue(const std::string& key,
+                                      const std::string& value, bool is_hex) {
+  const bool key_hex = is_hex;
+  const bool value_hex = is_hex;
+  return PrintKeyValue(key, value, key_hex, value_hex);
+}
+
+// Returns the usage fragment for the optional range arguments:
+// " [--from] [--to] ".
+std::string LDBCommand::HelpRangeCmdArgs() {
+  std::string usage = " ";
+  usage += "[--" + ARG_FROM + "] ";
+  usage += "[--" + ARG_TO + "] ";
+  return usage;
+}
+
+// Keys are hex-encoded when the hex or key_hex flag is present, or when
+// either of them is given as an option whose value is true.
+bool LDBCommand::IsKeyHex(const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags) {
+  if (IsFlagPresent(flags, ARG_HEX)) {
+    return true;
+  }
+  if (IsFlagPresent(flags, ARG_KEY_HEX)) {
+    return true;
+  }
+  if (ParseBooleanOption(options, ARG_HEX, false)) {
+    return true;
+  }
+  return ParseBooleanOption(options, ARG_KEY_HEX, false);
+}
+
+// Values are hex-encoded when the hex or value_hex flag is present, or when
+// either of them is given as an option whose value is true.
+bool LDBCommand::IsValueHex(const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags) {
+  if (IsFlagPresent(flags, ARG_HEX)) {
+    return true;
+  }
+  if (IsFlagPresent(flags, ARG_VALUE_HEX)) {
+    return true;
+  }
+  if (ParseBooleanOption(options, ARG_HEX, false)) {
+    return true;
+  }
+  return ParseBooleanOption(options, ARG_VALUE_HEX, false);
+}
+
+// Returns the boolean value of `option` in `options`, or `default_val` when
+// the option is absent. The value is converted with StringToBool(), which
+// throws for anything other than "true"/"false" (case-insensitive).
+bool LDBCommand::ParseBooleanOption(
+    const std::map<std::string, std::string>& options,
+    const std::string& option, bool default_val) {
+  const auto itr = options.find(option);
+  if (itr == options.end()) {
+    return default_val;
+  }
+  return StringToBool(itr->second);
+}
+
+// Converts "true"/"false" (compared case-insensitively) to a bool.
+// Throws the C string "Invalid value for boolean argument" for any other
+// input.
+bool LDBCommand::StringToBool(std::string val) {
+  // tolower() must be given a value representable as unsigned char; passing
+  // a negative plain char (any non-ASCII byte) is undefined behaviour.
+  std::transform(val.begin(), val.end(), val.begin(), [](char ch) -> char {
+    return static_cast<char>(::tolower(static_cast<unsigned char>(ch)));
+  });
+
+  if (val == "true") {
+    return true;
+  } else if (val == "false") {
+    return false;
+  } else {
+    throw "Invalid value for boolean argument";
+  }
+}
+
+// Reads the optional from/to options delimiting the key range to compact.
+// An absent bound leaves the matching null_*_ member true. With hex keys
+// enabled the bounds are decoded with HexToString().
+CompactorCommand::CompactorCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_TTL})),
+      null_from_(true),
+      null_to_(true) {
+  const auto from_arg = options.find(ARG_FROM);
+  if (from_arg != options.end()) {
+    null_from_ = false;
+    from_ = is_key_hex_ ? HexToString(from_arg->second) : from_arg->second;
+  }
+
+  const auto to_arg = options.find(ARG_TO);
+  if (to_arg != options.end()) {
+    null_to_ = false;
+    to_ = is_key_hex_ ? HexToString(to_arg->second) : to_arg->second;
+  }
+}
+
+// Appends this command's usage line (name plus the optional range
+// arguments) to `ret`.
+void CompactorCommand::Help(std::string& ret) {
+  ret += " " + CompactorCommand::Name() + HelpRangeCmdArgs() + "\n";
+}
+
+// Compacts the key range [from_, to_] of the selected column family; an
+// absent bound is passed as nullptr. The outcome of the compaction is
+// recorded in exec_state_.
+void CompactorCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  // Stack-allocated views over the bounds; CompactRange() gets a null
+  // pointer for a missing bound.
+  Slice from_slice(from_);
+  Slice to_slice(to_);
+  Slice* begin = null_from_ ? nullptr : &from_slice;
+  Slice* end = null_to_ ? nullptr : &to_slice;
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+
+  // Report a failed compaction instead of unconditionally claiming success.
+  Status s = db_->CompactRange(cro, GetCfHandle(), begin, end);
+  if (s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Succeed("");
+  } else {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Compaction failed: " + s.ToString());
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Names of the flags specific to the load command; the DBLoaderCommand
+// constructor reads each of them with IsFlagPresent().
+const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
+const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
+const std::string DBLoaderCommand::ARG_COMPACT = "compact";
+
+// Sets up a read-write command and records which of the loader flags
+// (create_if_missing, disable_wal, bulk_load, compact) were given.
+DBLoaderCommand::DBLoaderCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false,
+          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                               ARG_TO, ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
+                               ARG_BULK_LOAD, ARG_COMPACT})),
+      disable_wal_(false),
+      bulk_load_(false),
+      compact_(false) {
+  // Loader-specific switches.
+  compact_ = IsFlagPresent(flags, ARG_COMPACT);
+  bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD);
+  disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL);
+  // Inherited from LDBCommand, so it cannot go in the initializer list.
+  create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
+}
+
+// Appends this command's usage line (name plus its optional flags) to `ret`.
+void DBLoaderCommand::Help(std::string& ret) {
+  std::string usage = " " + DBLoaderCommand::Name();
+  usage += " [--" + ARG_CREATE_IF_MISSING + "]";
+  usage += " [--" + ARG_DISABLE_WAL + "]";
+  usage += " [--" + ARG_BULK_LOAD + "]";
+  usage += " [--" + ARG_COMPACT + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Starts from the base-class options, applies the create_if_missing switch
+// and, when bulk loading was requested, calls PrepareForBulkLoad() on them.
+Options DBLoaderCommand::PrepareOptionsForOpenDB() {
+  Options load_options = LDBCommand::PrepareOptionsForOpenDB();
+  load_options.create_if_missing = create_if_missing_;
+  if (!bulk_load_) {
+    return load_options;
+  }
+  load_options.PrepareForBulkLoad();
+  return load_options;
+}
+
+// Loads key/value pairs into the selected column family from standard input,
+// one pair per line in the format understood by ParseKeyValue(). Lines that
+// do not parse are counted and reported as a warning, except for two known
+// informational prefixes which are skipped silently. With --compact the whole
+// key range is compacted afterwards. A failed write stops the load, and both
+// write and compaction failures are recorded in exec_state_.
+void DBLoaderCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  WriteOptions write_options;
+  if (disable_wal_) {
+    write_options.disableWAL = true;
+  }
+
+  int bad_lines = 0;
+  std::string line;
+  // prefer ifstream getline performance vs that from std::cin istream
+  std::ifstream ifs_stdin("/dev/stdin");
+  std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
+  Status s;
+  // Stop at the first failed write rather than silently dropping data.
+  while (s.ok() && getline(*istream_p, line, '\n')) {
+    std::string key;
+    std::string value;
+    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      s = db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value));
+    } else if (0 == line.find("Keys in range:")) {
+      // ignore this line
+    } else if (0 == line.find("Created bg thread 0x")) {
+      // ignore this line
+    } else {
+      bad_lines++;
+    }
+  }
+
+  if (bad_lines > 0) {
+    std::cout << "Warning: " << bad_lines << " bad lines ignored." << std::endl;
+  }
+  if (!s.ok()) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Load failed: " + s.ToString());
+    return;
+  }
+  if (compact_) {
+    s = db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr,
+                          nullptr);
+    if (!s.ok()) {
+      exec_state_ = LDBCommandExecuteResult::Failed("Compaction failed: " +
+                                                    s.ToString());
+    }
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+// Dumps the manifest at `file` through a locally constructed VersionSet.
+// `options` is taken by value because it is adjusted here before use. A
+// failing dump is reported on stderr; nothing is returned to the caller.
+void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
+                      bool json) {
+  EnvOptions env_options;
+  std::string placeholder_name("dummy");
+  std::shared_ptr<Cache> table_cache(NewLRUCache(
+      options.max_open_files - 10, options.table_cache_numshardbits));
+  // The options do not pass through SanitizeOptions() here, so anything
+  // VersionSet::DumpManifest() needs from that step has to be set by hand.
+  options.db_paths.emplace_back("dummy", 0);
+  options.num_levels = 64;
+  WriteController write_controller(options.delayed_write_rate);
+  WriteBufferManager write_buffer_manager(options.db_write_buffer_size);
+  ImmutableDBOptions immutable_db_options(options);
+  VersionSet versions(placeholder_name, &immutable_db_options, env_options,
+                      table_cache.get(), &write_buffer_manager,
+                      &write_controller,
+                      /*block_cache_tracer=*/nullptr);
+  Status dump_status = versions.DumpManifest(options, file, verbose, hex, json);
+  if (dump_status.ok()) {
+    return;
+  }
+  fprintf(stderr, "Error in processing file %s %s\n", file.c_str(),
+          dump_status.ToString().c_str());
+}
+
+} // namespace
+
+// Option names specific to ManifestDumpCommand: "verbose" and "json" are
+// looked up as flags, "path" as a key=value option naming the manifest file.
+const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose";
+const std::string ManifestDumpCommand::ARG_JSON = "json";
+const std::string ManifestDumpCommand::ARG_PATH = "path";
+
+// Appends the one-line usage summary of the manifest dump command to `ret`.
+void ManifestDumpCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += ManifestDumpCommand::Name();
+  usage += " [--" + ARG_VERBOSE + "]";
+  usage += " [--" + ARG_JSON + "]";
+  usage += " [--" + ARG_PATH + "=<path_to_manifest_file>]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Records the --verbose / --json flags and the optional --path value. An
+// explicitly empty --path puts the command into the failed state.
+ManifestDumpCommand::ManifestDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false,
+          BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})),
+      verbose_(IsFlagPresent(flags, ARG_VERBOSE)),
+      json_(IsFlagPresent(flags, ARG_JSON)),
+      path_("") {
+  const auto path_itr = options.find(ARG_PATH);
+  if (path_itr == options.end()) {
+    return;
+  }
+  path_ = path_itr->second;
+  if (path_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname");
+  }
+}
+
+// Dumps a manifest file. With --path the given file is used directly;
+// otherwise the db directory is listed and the single descriptor file found
+// there is used. Finding none, or more than one, fails the command.
+void ManifestDumpCommand::DoCommand() {
+  std::string manifestfile;
+
+  if (!path_.empty()) {
+    manifestfile = path_;
+  } else {
+    // We need to find the manifest file by searching the directory
+    // containing the db for files of the form MANIFEST-[0-9]+
+
+    std::vector<std::string> files;
+    Status s = options_.env->GetChildren(db_path_, &files);
+    if (!s.ok()) {
+      std::string err_msg = s.ToString();
+      err_msg.append(": Failed to list the content of ");
+      err_msg.append(db_path_);
+      exec_state_ = LDBCommandExecuteResult::Failed(err_msg);
+      return;
+    }
+    std::string matched_file;
+#ifdef OS_WIN
+    const char kPathDelim = '\\';
+#else
+    const char kPathDelim = '/';
+#endif
+    for (const auto& file_path : files) {
+      // Some Env::GetChildren() return absolute paths. Some directories' path
+      // end with path delim, e.g. '/' or '\\'.
+      size_t pos = file_path.find_last_of(kPathDelim);
+      if (pos == file_path.size() - 1) {
+        continue;
+      }
+      std::string fname;
+      if (pos != std::string::npos) {
+        // Absolute path.
+        fname.assign(file_path, pos + 1, file_path.size() - pos - 1);
+      } else {
+        fname = file_path;
+      }
+      uint64_t file_num = 0;
+      FileType file_type = kLogFile;  // Just for initialization
+      if (ParseFileName(fname, &file_num, &file_type) &&
+          file_type == kDescriptorFile) {
+        if (!matched_file.empty()) {
+          exec_state_ = LDBCommandExecuteResult::Failed(
+              "Multiple MANIFEST files found; use --path to select one");
+          return;
+        } else {
+          matched_file.swap(fname);
+        }
+      }
+    }
+    if (matched_file.empty()) {
+      std::string err_msg("No MANIFEST found in ");
+      err_msg.append(db_path_);
+      exec_state_ = LDBCommandExecuteResult::Failed(err_msg);
+      return;
+    }
+    // Guard against an empty db path before looking at its last character.
+    if (!db_path_.empty() && db_path_.back() != '/') {
+      db_path_.append("/");
+    }
+    manifestfile = db_path_ + matched_file;
+  }
+
+  if (verbose_) {
+    fprintf(stdout, "Processing Manifest file %s\n", manifestfile.c_str());
+  }
+
+  DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_);
+
+  if (verbose_) {
+    fprintf(stdout, "Processing Manifest file %s done\n", manifestfile.c_str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+namespace {
+
+// Fills `checksum_list` with the checksum information of the live files of
+// the database at `db_path`. A local VersionSet is recovered over the listed
+// column families for that purpose. `options` is taken by value because it is
+// adjusted here. Any failure is reported on stderr.
+void GetLiveFilesChecksumInfoFromVersionSet(Options options,
+                                            const std::string& db_path,
+                                            FileChecksumList* checksum_list) {
+  EnvOptions sopt;
+  Status s;
+  std::string dbname(db_path);
+  std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10,
+                                        options.table_cache_numshardbits));
+  // Notice we are using the default options not through SanitizeOptions(),
+  // if VersionSet::GetLiveFilesChecksumInfo depends on any option done by
+  // SanitizeOptions(), we need to initialize it manually.
+  options.db_paths.emplace_back(db_path, 0);
+  options.num_levels = 64;
+  WriteController wc(options.delayed_write_rate);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  ImmutableDBOptions immutable_db_options(options);
+  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+                      /*block_cache_tracer=*/nullptr);
+  std::vector<std::string> cf_name_list;
+  s = versions.ListColumnFamilies(&cf_name_list, db_path,
+                                  options.file_system.get());
+  if (s.ok()) {
+    std::vector<ColumnFamilyDescriptor> cf_list;
+    for (const auto& name : cf_name_list) {
+      cf_list.emplace_back(name, ColumnFamilyOptions(options));
+    }
+    s = versions.Recover(cf_list, true);
+  }
+  if (s.ok()) {
+    s = versions.GetLiveFilesChecksumInfo(checksum_list);
+  }
+  if (!s.ok()) {
+    // Terminate the line so the message does not run into later output.
+    fprintf(stderr, "Error Status: %s\n", s.ToString().c_str());
+  }
+}
+
+} // namespace
+
+// Name of the key=value option that the FileChecksumDumpCommand constructor
+// reads into path_.
+const std::string FileChecksumDumpCommand::ARG_PATH = "path";
+
+// Appends the one-line usage summary of the file checksum dump command.
+void FileChecksumDumpCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += FileChecksumDumpCommand::Name();
+  usage += " [--" + ARG_PATH + "=<path_to_manifest_file>]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Stores the optional --path value; an explicitly empty value puts the
+// command into the failed state. Positional parameters are not used.
+FileChecksumDumpCommand::FileChecksumDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_PATH})),
+      path_("") {
+  const auto path_itr = options.find(ARG_PATH);
+  if (path_itr == options.end()) {
+    return;
+  }
+  path_ = path_itr->second;
+  if (path_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname");
+  }
+}
+
+// Prints the checksum information of the live files of the database at
+// db_path_, one file per line. If the collected list cannot be read back the
+// command fails instead of printing a misleading completion message.
+void FileChecksumDumpCommand::DoCommand() {
+  // print out the checksum information in the following format:
+  //  sst file number, checksum function name, checksum value
+  //  sst file number, checksum function name, checksum value
+  //  ......
+
+  std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
+  GetLiveFilesChecksumInfoFromVersionSet(options_, db_path_,
+                                         checksum_list.get());
+  if (checksum_list != nullptr) {
+    std::vector<uint64_t> file_numbers;
+    std::vector<std::string> checksums;
+    std::vector<std::string> checksum_func_names;
+    Status s = checksum_list->GetAllFileChecksums(&file_numbers, &checksums,
+                                                  &checksum_func_names);
+    if (!s.ok()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+      return;
+    }
+    for (size_t i = 0; i < file_numbers.size(); i++) {
+      // The three vectors are filled in parallel by GetAllFileChecksums().
+      assert(i < checksums.size());
+      assert(i < checksum_func_names.size());
+      // File numbers are unsigned, so print them with the unsigned format.
+      fprintf(stdout, "%" PRIu64 ", %s, %s\n", file_numbers[i],
+              checksum_func_names[i].c_str(), checksums[i].c_str());
+    }
+    fprintf(stdout, "Print SST file checksum information finished \n");
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Appends the one-line usage summary of the list column families command.
+void ListColumnFamiliesCommand::Help(std::string& ret) {
+  ret += " ";
+  ret += ListColumnFamiliesCommand::Name();
+  ret += "\n";
+}
+
+// Forwards the parsed options and flags to LDBCommand. No option names are
+// registered (empty BuildCmdLineOptions list) and the positional parameters
+// are ignored.
+ListColumnFamiliesCommand::ListColumnFamiliesCommand(
+ const std::vector<std::string>& /*params*/,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+
+// Prints the names of all column families of the database at db_path_ as a
+// comma-separated list in braces. A listing failure is recorded in
+// exec_state_, as the other commands do, so the command counts as failed.
+void ListColumnFamiliesCommand::DoCommand() {
+  std::vector<std::string> column_families;
+  Status s = DB::ListColumnFamilies(options_, db_path_, &column_families);
+  if (!s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Error in processing db " + db_path_ + " " + s.ToString());
+    return;
+  }
+
+  fprintf(stdout, "Column families in %s: \n{", db_path_.c_str());
+  bool first = true;
+  // Iterate by reference; the names only need to be read.
+  for (const auto& cf : column_families) {
+    if (!first) {
+      fprintf(stdout, ", ");
+    }
+    first = false;
+    fprintf(stdout, "%s", cf.c_str());
+  }
+  fprintf(stdout, "}\n");
+}
+
+// Appends the one-line usage summary of the create column family command.
+void CreateColumnFamilyCommand::Help(std::string& ret) {
+  ret += " ";
+  ret += CreateColumnFamilyCommand::Name();
+  ret += " --db=<db_path> <new_column_family_name>";
+  ret += "\n";
+}
+
+// Expects exactly one positional parameter, the name of the column family to
+// create; any other parameter count puts the command into the failed state.
+CreateColumnFamilyCommand::CreateColumnFamilyCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, {ARG_DB}) {
+  if (params.size() == 1) {
+    new_cf_name_ = params[0];
+    return;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed(
+      "new column family name must be specified");
+}
+
+// Creates the column family named new_cf_name_ in the open database, prints
+// "OK" on success and records the failure otherwise, then closes the DB.
+void CreateColumnFamilyCommand::DoCommand() {
+  // Same guard as the other commands: without an open DB there is nothing to
+  // do and the failure has already been recorded.
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  ColumnFamilyHandle* new_cf_handle = nullptr;
+  Status st = db_->CreateColumnFamily(options_, new_cf_name_, &new_cf_handle);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Fail to create new column family: " + st.ToString());
+  }
+  delete new_cf_handle;
+  CloseDB();
+}
+
+// Appends the one-line usage summary of the drop column family command.
+void DropColumnFamilyCommand::Help(std::string& ret) {
+  ret += " ";
+  ret += DropColumnFamilyCommand::Name();
+  ret += " --db=<db_path> <column_family_name_to_drop>";
+  ret += "\n";
+}
+
+// Expects exactly one positional parameter, the name of the column family to
+// drop; any other parameter count puts the command into the failed state.
+DropColumnFamilyCommand::DropColumnFamilyCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, {ARG_DB}) {
+  if (params.size() == 1) {
+    cf_name_to_drop_ = params[0];
+    return;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed(
+      "The name of column family to drop must be specified");
+}
+
+// Drops the column family named cf_name_to_drop_. The handle is looked up in
+// cf_handles_; an unknown name or a failing drop is recorded in exec_state_.
+// The DB is closed after the drop attempt.
+void DropColumnFamilyCommand::DoCommand() {
+  // Same guard as the other commands: without an open DB there is nothing to
+  // do and the failure has already been recorded.
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  auto iter = cf_handles_.find(cf_name_to_drop_);
+  if (iter == cf_handles_.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Column family: " + cf_name_to_drop_ + " doesn't exist in db.");
+    return;
+  }
+  ColumnFamilyHandle* cf_handle_to_drop = iter->second;
+  Status st = db_->DropColumnFamily(cf_handle_to_drop);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Fail to drop column family: " + st.ToString());
+  }
+  CloseDB();
+}
+
+// ----------------------------------------------------------------------------
+namespace {
+
+// Adds one to the bucket that contains timestamp `timekv`. Buckets are
+// `bucket_size` wide and start at `ttl_start`. Callers must only use this
+// when there is more than one bucket and `timekv` lies within
+// [ttl_start, ttl_start + time_range); debug builds assert these conditions.
+void IncBucketCounts(std::vector<uint64_t>& bucket_counts, int ttl_start,
+                     int time_range, int bucket_size, int timekv,
+                     int num_buckets) {
+#ifdef NDEBUG
+  // Only referenced by the assert below.
+  (void)time_range;
+  (void)num_buckets;
+#endif
+  assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 &&
+         timekv < (ttl_start + time_range) && num_buckets > 1);
+  const int offset = timekv - ttl_start;
+  ++bucket_counts[offset / bucket_size];
+}
+
+// Prints one "Keys in range <start> to <end> : <count>" line per bucket.
+// Every bucket but the last spans `bucket_size`; the last one ends at
+// `ttl_end`. `bucket_counts` must hold at least `num_buckets` entries and
+// `num_buckets` must be positive.
+void PrintBucketCounts(const std::vector<uint64_t>& bucket_counts,
+                       int ttl_start, int ttl_end, int bucket_size,
+                       int num_buckets) {
+  int time_point = ttl_start;
+  for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) {
+    // PRIu64 prints the full 64-bit count on every platform; a cast to
+    // unsigned long would truncate where long is 32 bits wide.
+    fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n",
+            TimeToHumanString(time_point).c_str(),
+            TimeToHumanString(time_point + bucket_size).c_str(),
+            bucket_counts[i]);
+  }
+  fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n",
+          TimeToHumanString(time_point).c_str(),
+          TimeToHumanString(ttl_end).c_str(), bucket_counts[num_buckets - 1]);
+}
+
+} // namespace
+
+// Option names specific to InternalDumpCommand. "count_delim" is accepted
+// both as a key=value option and as a bare flag; the others are looked up as
+// flags by the constructor.
+const std::string InternalDumpCommand::ARG_COUNT_ONLY = "count_only";
+const std::string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim";
+const std::string InternalDumpCommand::ARG_STATS = "stats";
+const std::string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex";
+
+// Parses the range bounds, the key limit and the counting / statistics
+// switches of the internal dump command. The delimiter defaults to "." when
+// --count_delim carries no value, and the range bounds are hex-decoded when
+// --input_key_hex is set.
+InternalDumpCommand::InternalDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                               ARG_TO, ARG_MAX_KEYS, ARG_COUNT_ONLY,
+                               ARG_COUNT_DELIM, ARG_STATS, ARG_INPUT_KEY_HEX})),
+      has_from_(false),
+      has_to_(false),
+      max_keys_(-1),
+      delim_("."),
+      count_only_(IsFlagPresent(flags, ARG_COUNT_ONLY)),
+      count_delim_(false),
+      print_stats_(IsFlagPresent(flags, ARG_STATS)),
+      is_input_key_hex_(IsFlagPresent(flags, ARG_INPUT_KEY_HEX)) {
+  has_from_ = ParseStringOption(options, ARG_FROM, &from_);
+  has_to_ = ParseStringOption(options, ARG_TO, &to_);
+
+  ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_);
+
+  // --count_delim=<char> both enables per-prefix counting and chooses the
+  // delimiter; the bare flag enables it with the default delimiter.
+  const auto delim_itr = options.find(ARG_COUNT_DELIM);
+  const bool delim_given = delim_itr != options.end();
+  count_delim_ = delim_given || IsFlagPresent(flags, ARG_COUNT_DELIM);
+  if (delim_given) {
+    delim_ = delim_itr->second;
+  } else {
+    delim_ = ".";
+  }
+
+  if (is_input_key_hex_ && has_from_) {
+    from_ = HexToString(from_);
+  }
+  if (is_input_key_hex_ && has_to_) {
+    to_ = HexToString(to_);
+  }
+}
+
+// Appends the one-line usage summary of the internal dump command to `ret`.
+void InternalDumpCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += InternalDumpCommand::Name();
+  usage += HelpRangeCmdArgs();
+  usage += " [--" + ARG_INPUT_KEY_HEX + "]";
+  usage += " [--" + ARG_MAX_KEYS + "=<N>]";
+  usage += " [--" + ARG_COUNT_ONLY + "]";
+  usage += " [--" + ARG_COUNT_DELIM + "=<char>]";
+  usage += " [--" + ARG_STATS + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Dumps the internal key versions of the selected column family between
+// from_ and to_. Depending on the flags it prints every entry, only the total
+// count, or per-prefix counts and sizes grouped by the text before the
+// delimiter.
+void InternalDumpCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  if (print_stats_) {
+    std::string db_stats;
+    if (db_->GetProperty(GetCfHandle(), "rocksdb.stats", &db_stats)) {
+      fprintf(stdout, "%s\n", db_stats.c_str());
+    }
+  }
+
+  // Collect all key versions of the range up front.
+  std::vector<KeyVersion> key_versions;
+  Status fetch_status = GetAllKeyVersions(db_, GetCfHandle(), from_, to_,
+                                          max_keys_, &key_versions);
+  if (!fetch_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(fetch_status.ToString());
+    return;
+  }
+
+  // Number of leading characters of `s` before the first '\x01' or '\0'.
+  // Indexing a std::string at size() yields '\0', which ends the scan.
+  const auto leading_run = [](const std::string& s) {
+    uint64_t len = 0;
+    while (s[len] != '\x01' && s[len] != '\0') {
+      ++len;
+    }
+    return len;
+  };
+
+  std::string group;       // prefix of the current row
+  std::string prev_group;  // prefix of the run being accumulated
+  uint64_t group_count = 0;
+  uint64_t group_size = 0;
+
+  long long count = 0;
+  for (auto& key_version : key_versions) {
+    InternalKey ikey(key_version.user_key, key_version.sequence,
+                     static_cast<ValueType>(key_version.type));
+    if (has_to_ && ikey.user_key() == to_) {
+      // GetAllKeyVersions() includes keys with user key `to_`, but idump has
+      // traditionally excluded such keys.
+      break;
+    }
+    ++count;
+    if (count_delim_) {
+      const std::string row = ikey.Encode().ToString();
+      const uint64_t entry_size =
+          leading_run(row) + leading_run(key_version.value);
+
+      // The group name is the row text up to the delimiter (or up to the
+      // first '\0' / '\x01', whichever comes first).
+      group.clear();
+      for (int j = 0;
+           row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01'; j++) {
+        group += row[j];
+      }
+
+      const bool starts_new_group = !prev_group.empty() && prev_group != group;
+      if (starts_new_group) {
+        // Flush the finished run before starting to count the new one.
+        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+                prev_group.c_str(), group_count, group_size);
+        group_count = 1;
+        group_size = entry_size;
+      } else {
+        group_count++;
+        group_size += entry_size;
+      }
+      prev_group = group;
+    }
+
+    if (!count_only_ && !count_delim_) {
+      std::string key = ikey.DebugString(is_key_hex_);
+      std::string value = Slice(key_version.value).ToString(is_value_hex_);
+      std::cout << key << " => " << value << "\n";
+    }
+
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys_ > 0 && count >= max_keys_) {
+      break;
+    }
+  }
+
+  if (count_delim_) {
+    fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+            prev_group.c_str(), group_count, group_size);
+  } else {
+    fprintf(stdout, "Internal keys in range: %lld\n", count);
+  }
+}
+
+// Option names specific to DBDumperCommand. "count_delim" is accepted both as
+// a key=value option and as a bare flag; "bucket" is parsed as an integer
+// option when dumping.
+const std::string DBDumperCommand::ARG_COUNT_ONLY = "count_only";
+const std::string DBDumperCommand::ARG_COUNT_DELIM = "count_delim";
+const std::string DBDumperCommand::ARG_STATS = "stats";
+const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket";
+
+// Parses the options of the dump command: optional range bounds, key limit,
+// counting / statistics switches and an optional --path naming a single file
+// to dump. When no db path has been set, the --path value is used as the db
+// path as well.
+DBDumperCommand::DBDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions(
+                     {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                      ARG_TO, ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM,
+                      ARG_STATS, ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET,
+                      ARG_TIMESTAMP, ARG_PATH})),
+      null_from_(true),
+      null_to_(true),
+      max_keys_(-1),
+      count_only_(IsFlagPresent(flags, ARG_COUNT_ONLY)),
+      count_delim_(false),
+      print_stats_(IsFlagPresent(flags, ARG_STATS)) {
+  const auto not_found = options.end();
+
+  const auto from_itr = options.find(ARG_FROM);
+  if (from_itr != not_found) {
+    null_from_ = false;
+    from_ = from_itr->second;
+  }
+
+  const auto to_itr = options.find(ARG_TO);
+  if (to_itr != not_found) {
+    null_to_ = false;
+    to_ = to_itr->second;
+  }
+
+  const auto max_keys_itr = options.find(ARG_MAX_KEYS);
+  if (max_keys_itr != not_found) {
+    try {
+#if defined(CYGWIN)
+      max_keys_ = strtol(max_keys_itr->second.c_str(), 0, 10);
+#else
+      max_keys_ = std::stoi(max_keys_itr->second);
+#endif
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
+    }
+  }
+
+  // --count_delim=<char> both enables per-prefix counting and chooses the
+  // delimiter; the bare flag enables it with the default delimiter ".".
+  const auto delim_itr = options.find(ARG_COUNT_DELIM);
+  const bool delim_given = delim_itr != not_found;
+  count_delim_ = delim_given || IsFlagPresent(flags, ARG_COUNT_DELIM);
+  if (delim_given) {
+    delim_ = delim_itr->second;
+  } else {
+    delim_ = ".";
+  }
+
+  if (is_key_hex_ && !null_from_) {
+    from_ = HexToString(from_);
+  }
+  if (is_key_hex_ && !null_to_) {
+    to_ = HexToString(to_);
+  }
+
+  const auto path_itr = options.find(ARG_PATH);
+  if (path_itr != not_found) {
+    path_ = path_itr->second;
+    if (db_path_.empty()) {
+      db_path_ = path_;
+    }
+  }
+}
+
+// Appends the one-line usage summary of the dump command to `ret`.
+void DBDumperCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += DBDumperCommand::Name();
+  usage += HelpRangeCmdArgs();
+  usage += " [--" + ARG_TTL + "]";
+  usage += " [--" + ARG_MAX_KEYS + "=<N>]";
+  usage += " [--" + ARG_TIMESTAMP + "]";
+  usage += " [--" + ARG_COUNT_ONLY + "]";
+  usage += " [--" + ARG_COUNT_DELIM + "=<char>]";
+  usage += " [--" + ARG_STATS + "]";
+  usage += " [--" + ARG_TTL_BUCKET + "=<N>]";
+  usage += " [--" + ARG_TTL_START + "=<N>:- is inclusive]";
+  usage += " [--" + ARG_TTL_END + "=<N>:- is exclusive]";
+  usage += " [--" + ARG_PATH + "=<path_to_a_file>]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+/**
+ * Handles two separate cases:
+ *
+ * 1) --db is specified - just dump the database.
+ *
+ * 2) --path is specified - determine based on file extension what dumping
+ *    function to call. Please note that we intentionally use the extension
+ *    and avoid probing the file contents under the assumption that renaming
+ *    the files is not a supported scenario.
+ *
+ * If no database is open and no --path was given there is nothing to dump;
+ * the command then stays (or is put) in the failed state instead of replacing
+ * the original failure with a misleading file-name parse error.
+ */
+void DBDumperCommand::DoCommand() {
+  if (!db_) {
+    if (path_.empty()) {
+      // Keep whatever failure is already recorded; only add one if none is.
+      if (!exec_state_.IsFailed()) {
+        exec_state_ = LDBCommandExecuteResult::Failed(
+            "Nothing to dump: the DB is not open and --" + ARG_PATH +
+            " was not given");
+      }
+      return;
+    }
+    std::string fileName = GetFileNameFromPath(path_);
+    uint64_t number = 0;
+    FileType type = kLogFile;  // Just for initialization
+
+    // A file is being dumped, so a failed DB open no longer matters.
+    exec_state_ = LDBCommandExecuteResult::Succeed("");
+
+    if (!ParseFileName(fileName, &number, &type)) {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed("Can't parse file type: " + path_);
+      return;
+    }
+
+    switch (type) {
+      case kLogFile:
+        // TODO(myabandeh): allow configuring is_write_commited
+        DumpWalFile(options_, path_, /* print_header_ */ true,
+                    /* print_values_ */ true, true /* is_write_commited */,
+                    &exec_state_);
+        break;
+      case kTableFile:
+        DumpSstFile(options_, path_, is_key_hex_, /* show_properties */ true);
+        break;
+      case kDescriptorFile:
+        DumpManifestFile(options_, path_, /* verbose_ */ false, is_key_hex_,
+                         /*  json_ */ false);
+        break;
+      default:
+        exec_state_ = LDBCommandExecuteResult::Failed(
+            "File type not supported: " + path_);
+        break;
+    }
+
+  } else {
+    DoDumpCommand();
+  }
+}
+
+// Scans the selected column family of the open database from from_ (or the
+// first key) up to, but excluding, to_ and prints the entries. Depending on
+// the flags it prints the key/value pairs, only counts with a value size
+// histogram, per-prefix counts, or per-time-bucket counts for TTL databases.
+// Iterator errors are recorded in exec_state_.
+void DBDumperCommand::DoDumpCommand() {
+  assert(nullptr != db_);
+  assert(path_.empty());
+
+  // Parse command line args
+  uint64_t count = 0;
+  if (print_stats_) {
+    std::string stats;
+    // Query the same column family that is scanned below.
+    if (db_->GetProperty(GetCfHandle(), "rocksdb.stats", &stats)) {
+      fprintf(stdout, "%s\n", stats.c_str());
+    }
+  }
+
+  // Setup key iterator; the unique_ptr releases it on every exit path.
+  ReadOptions scan_read_opts;
+  scan_read_opts.total_order_seek = true;
+  std::unique_ptr<Iterator> iter(
+      db_->NewIterator(scan_read_opts, GetCfHandle()));
+  Status st = iter->status();
+  if (!st.ok()) {
+    // Do not scan with a broken iterator and report an empty range.
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Iterator error." + st.ToString());
+    return;
+  }
+
+  if (!null_from_) {
+    iter->Seek(from_);
+  } else {
+    iter->SeekToFirst();
+  }
+
+  int max_keys = max_keys_;
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    return;
+  }
+  int time_range = ttl_end - ttl_start;
+  int bucket_size;
+  if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) ||
+      bucket_size <= 0) {
+    bucket_size = time_range;  // Will have just 1 bucket by default
+  }
+  // creating variables for row count of each type
+  std::string rtype1, rtype2, row, val;
+  rtype2 = "";
+  uint64_t c = 0;
+  uint64_t s1 = 0, s2 = 0;
+
+  // At this point, bucket_size=0 => time_range=0
+  int num_buckets = (bucket_size >= time_range)
+                        ? 1
+                        : ((time_range + bucket_size - 1) / bucket_size);
+  std::vector<uint64_t> bucket_counts(num_buckets, 0);
+  if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
+    fprintf(stdout, "Dumping key-values from %s to %s\n",
+            TimeToHumanString(ttl_start).c_str(),
+            TimeToHumanString(ttl_end).c_str());
+  }
+
+  HistogramImpl vsize_hist;
+
+  for (; iter->Valid(); iter->Next()) {
+    int rawtime = 0;
+    // If end marker was specified, we stop before it
+    if (!null_to_ && (iter->key().ToString() >= to_)) {
+      break;
+    }
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys == 0) {
+      break;
+    }
+    if (is_db_ttl_) {
+      TtlIterator* it_ttl =
+          static_cast_with_check<TtlIterator, Iterator>(iter.get());
+      rawtime = it_ttl->timestamp();
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+    }
+    if (max_keys > 0) {
+      --max_keys;
+    }
+    if (is_db_ttl_ && num_buckets > 1) {
+      IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size,
+                      rawtime, num_buckets);
+    }
+    ++count;
+    if (count_delim_) {
+      rtype1 = "";
+      row = iter->key().ToString();
+      val = iter->value().ToString();
+      s1 = row.size() + val.size();
+      // The group name is the key text before the delimiter; indexing a
+      // std::string at size() yields '\0', which ends the scan.
+      for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) {
+        rtype1 += row[j];
+      }
+      if (rtype2.compare("") && rtype2.compare(rtype1) != 0) {
+        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+                rtype2.c_str(), c, s2);
+        c = 1;
+        s2 = s1;
+        rtype2 = rtype1;
+      } else {
+        c++;
+        s2 += s1;
+        rtype2 = rtype1;
+      }
+    }
+
+    if (count_only_) {
+      vsize_hist.Add(iter->value().size());
+    }
+
+    if (!count_only_ && !count_delim_) {
+      if (is_db_ttl_ && timestamp_) {
+        fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str());
+      }
+      std::string str =
+          PrintKeyValue(iter->key().ToString(), iter->value().ToString(),
+                        is_key_hex_, is_value_hex_);
+      fprintf(stdout, "%s\n", str.c_str());
+    }
+  }
+
+  if (num_buckets > 1 && is_db_ttl_) {
+    PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size,
+                      num_buckets);
+  } else if (count_delim_) {
+    fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+            rtype2.c_str(), c, s2);
+  } else {
+    fprintf(stdout, "Keys in range: %" PRIu64 "\n", count);
+  }
+
+  if (count_only_) {
+    fprintf(stdout, "Value size distribution: \n");
+    fprintf(stdout, "%s\n", vsize_hist.ToString().c_str());
+  }
+
+  // A scan that stopped because of an error must not look like a clean dump.
+  st = iter->status();
+  if (!st.ok()) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Iterator error." + st.ToString());
+  }
+}
+
+// Option names specific to ReduceDBLevelsCommand: "new_levels" is parsed as
+// an integer option, "print_old_levels" is looked up as a flag.
+const std::string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels";
+const std::string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS =
+ "print_old_levels";
+
+// Parses --new_levels and --print_old_levels. A missing or non-positive
+// --new_levels value puts the command into the failed state.
+ReduceDBLevelsCommand::ReduceDBLevelsCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
+      old_levels_(1 << 7),
+      new_levels_(-1),
+      print_old_levels_(IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS)) {
+  ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_);
+
+  const bool has_valid_level_count = new_levels_ > 0;
+  if (has_valid_level_count) {
+    return;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed(
+      " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
+}
+
+// Builds the argument vector for a reduce_levels invocation on `db_path`
+// with the requested level count; the old-level flag is appended on demand.
+std::vector<std::string> ReduceDBLevelsCommand::PrepareArgs(
+    const std::string& db_path, int new_levels, bool print_old_level) {
+  std::vector<std::string> args = {
+      "reduce_levels",
+      "--" + ARG_DB + "=" + db_path,
+      "--" + ARG_NEW_LEVELS + "=" + ROCKSDB_NAMESPACE::ToString(new_levels),
+  };
+  if (print_old_level) {
+    args.emplace_back("--" + ARG_PRINT_OLD_LEVELS);
+  }
+  return args;
+}
+
+// Appends the one-line usage summary of the reduce levels command to `ret`.
+void ReduceDBLevelsCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += ReduceDBLevelsCommand::Name();
+  usage += " --" + ARG_NEW_LEVELS + "=<New number of levels>";
+  usage += " [--" + ARG_PRINT_OLD_LEVELS + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Starts from the base-class options, opens the DB with old_levels_ levels
+// and configures the level size limits so that size compaction is disabled.
+Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
+  Options db_options = LDBCommand::PrepareOptionsForOpenDB();
+  db_options.num_levels = old_levels_;
+  db_options.max_bytes_for_level_multiplier_additional.resize(
+      db_options.num_levels, 1);
+
+  // Disable size compaction
+  db_options.max_bytes_for_level_base = 1ULL << 50;
+  db_options.max_bytes_for_level_multiplier = 1;
+  return db_options;
+}
+
+// Determines how many levels of the default column family currently hold
+// files. A local VersionSet is recovered from the manifest of db_path_ and
+// `*levels` is set to the highest non-empty level plus one (0 when no level
+// holds files). Returns the recovery status; `*levels` is left untouched when
+// recovery fails.
+Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
+                                                int* levels) {
+  ImmutableDBOptions db_options(opt);
+  EnvOptions soptions;
+  std::shared_ptr<Cache> tc(
+      NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits));
+  WriteController wc(opt.delayed_write_rate);
+  WriteBufferManager wb(opt.db_write_buffer_size);
+  VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
+                      /*block_cache_tracer=*/nullptr);
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(opt));
+  dummy.push_back(dummy_descriptor);
+  // We rely the VersionSet::Recover to tell us the internal data structures
+  // in the db. And the Recover() should never do any change
+  // (like LogAndApply) to the manifest file.
+  Status st = versions.Recover(dummy);
+  if (!st.ok()) {
+    return st;
+  }
+  // Remember the highest level that contains at least one file.
+  int max = -1;
+  auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
+  for (int i = 0; i < default_cfd->NumberLevels(); i++) {
+    if (default_cfd->current()->storage_info()->NumLevelFiles(i)) {
+      max = i;
+    }
+  }
+
+  *levels = max + 1;
+  return st;
+}
+
+// Reduces the number of levels of the database to new_levels_. Nothing is
+// changed when the levels currently in use already fit. Otherwise the whole
+// DB is compacted, closed, and VersionSet::ReduceNumberOfLevels() is applied.
+// Any failing step is recorded in exec_state_ and stops the command.
+void ReduceDBLevelsCommand::DoCommand() {
+  if (new_levels_ <= 1) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Invalid number of levels.\n");
+    return;
+  }
+
+  Status st;
+  Options opt = PrepareOptionsForOpenDB();
+  int old_level_num = -1;
+  opt.file_system.reset(new LegacyFileSystemWrapper(opt.env));
+  st = GetOldNumOfLevels(opt, &old_level_num);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+
+  if (print_old_levels_) {
+    fprintf(stdout, "The old number of levels in use is %d\n", old_level_num);
+  }
+
+  if (old_level_num <= new_levels_) {
+    return;
+  }
+
+  old_levels_ = old_level_num;
+
+  OpenDB();
+  if (exec_state_.IsFailed()) {
+    return;
+  }
+  assert(db_ != nullptr);
+  // Compact the whole DB to put all files to the highest level.
+  fprintf(stdout, "Compacting the db...\n");
+  st = db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr,
+                         nullptr);
+  CloseDB();
+  // Do not rewrite the level layout if the compaction did not succeed.
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+
+  EnvOptions soptions;
+  st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+}
+
+// Names of the two integer options of ChangeCompactionStyleCommand; the
+// constructor parses both with ParseIntOption().
+const std::string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE =
+ "old_compaction_style";
+const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE =
+ "new_compaction_style";
+
+// Parses and validates the old and new compaction styles. Both must be level
+// or universal, they must differ, and universal-to-level is rejected as
+// "nothing to do". Each violation puts the command into the failed state and
+// stops further validation.
+ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions(
+                     {ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})),
+      old_compaction_style_(-1),
+      new_compaction_style_(-1) {
+  // Only these two styles can be named on the command line.
+  const auto is_supported_style = [](int style) {
+    return style == kCompactionStyleLevel ||
+           style == kCompactionStyleUniversal;
+  };
+
+  ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_,
+                 exec_state_);
+  if (!is_supported_style(old_compaction_style_)) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_,
+                 exec_state_);
+  if (!is_supported_style(new_compaction_style_)) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  if (new_compaction_style_ == old_compaction_style_) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Old compaction style is the same as new compaction style. "
+        "Nothing to do.\n");
+    return;
+  }
+
+  const bool universal_to_level =
+      old_compaction_style_ == kCompactionStyleUniversal &&
+      new_compaction_style_ == kCompactionStyleLevel;
+  if (universal_to_level) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Convert from universal compaction to level compaction. "
+        "Nothing to do.\n");
+  }
+}
+
+// Appends the one-line usage summary of the change compaction style command.
+void ChangeCompactionStyleCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += ChangeCompactionStyleCommand::Name();
+  usage += " --" + ARG_OLD_COMPACTION_STYLE + "=<Old compaction style: 0 " +
+           "for level compaction, 1 for universal compaction>";
+  usage += " --" + ARG_NEW_COMPACTION_STYLE + "=<New compaction style: 0 " +
+           "for level compaction, 1 for universal compaction>";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Returns the base open options, adjusted for a level -> universal
+// conversion so that a manual compaction can produce one single file.
+Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() {
+  Options opt = LDBCommand::PrepareOptionsForOpenDB();
+
+  const bool level_to_universal =
+      old_compaction_style_ == kCompactionStyleLevel &&
+      new_compaction_style_ == kCompactionStyleUniversal;
+  if (!level_to_universal) {
+    return opt;
+  }
+
+  // In order to convert from level compaction to universal compaction, we
+  // need to compact all data into a single file and move it to level 0.
+  // Background compactions are turned off and the size limits are lifted so
+  // nothing splits the output.
+  opt.disable_auto_compactions = true;
+  opt.target_file_size_base = INT_MAX;
+  opt.target_file_size_multiplier = 1;
+  opt.max_bytes_for_level_base = INT_MAX;
+  opt.max_bytes_for_level_multiplier = 1;
+  return opt;
+}
+
+// Manually compacts the column family into a single file placed at level 0
+// and verifies the result. The number of files per level is printed to
+// stdout before and after the compaction. Any failure (DB not open,
+// compaction error, unexpected file layout) is recorded in exec_state_.
+void ChangeCompactionStyleCommand::DoCommand() {
+  // Same guard as the other DB-backed commands in this file: bail out when
+  // there is no open DB instead of dereferencing a null pointer.
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  // print db stats before we have made any change
+  std::string property;
+  std::string files_per_level;
+  for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) {
+    db_->GetProperty(GetCfHandle(),
+                     "rocksdb.num-files-at-level" + NumberToString(i),
+                     &property);
+
+    // comma-separated list, built without a fixed-size scratch buffer
+    if (i > 0) {
+      files_per_level += ",";
+    }
+    files_per_level += property;
+  }
+  fprintf(stdout, "files per level before compaction: %s\n",
+          files_per_level.c_str());
+
+  // manual compact into a single file and move the file to level 0
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  Status compact_status =
+      db_->CompactRange(compact_options, GetCfHandle(), nullptr, nullptr);
+  if (!compact_status.ok()) {
+    // Surface the real cause instead of a misleading file-count mismatch.
+    exec_state_ = LDBCommandExecuteResult::Failed(compact_status.ToString());
+    return;
+  }
+
+  // verify compaction result
+  files_per_level = "";
+  int num_files = 0;
+  for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) {
+    db_->GetProperty(GetCfHandle(),
+                     "rocksdb.num-files-at-level" + NumberToString(i),
+                     &property);
+
+    if (i > 0) {
+      files_per_level += ",";
+    }
+    files_per_level += property;
+
+    num_files = atoi(property.c_str());
+
+    // level 0 should have only 1 file
+    if (i == 0 && num_files != 1) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level 0 after compaction is " +
+          ToString(num_files) + ", not 1.\n");
+      return;
+    }
+    // other levels should have no file
+    if (i > 0 && num_files != 0) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level " +
+          ToString(i) + " after compaction is " + ToString(num_files) +
+          ", not 0.\n");
+      return;
+    }
+  }
+
+  fprintf(stdout, "files per level after compaction: %s\n",
+          files_per_level.c_str());
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+// log::Reader reporter that writes every reported corruption to std::cerr.
+struct StdErrReporter : public log::Reader::Reporter {
+ // The byte count is ignored; only the status text is printed.
+ void Corruption(size_t /*bytes*/, const Status& s) override {
+ std::cerr << "Corruption detected in log file " << s.ToString() << "\n";
+ }
+};
+
+// WriteBatch handler that renders each operation of a batch as text into the
+// caller-supplied string stream. Keys (and values, when requested) are
+// written hex-encoded.
+class InMemoryHandler : public WriteBatch::Handler {
+ public:
+  InMemoryHandler(std::stringstream& row, bool print_values,
+                  bool write_after_commit = false)
+      : Handler(),
+        row_(row),
+        print_values_(print_values),
+        write_after_commit_(write_after_commit) {}
+
+  ~InMemoryHandler() override {}
+
+  // Writes "<hex key> " or, when values are printed, "<hex key> : <hex value> ".
+  void commonPutMerge(const Slice& key, const Slice& value) {
+    row_ << Hex(key);
+    if (print_values_) {
+      row_ << " : " << Hex(value);
+    }
+    row_ << " ";
+  }
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+    BeginCfOp("PUT(", cf);
+    commonPutMerge(key, value);
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+    BeginCfOp("MERGE(", cf);
+    commonPutMerge(key, value);
+    return Status::OK();
+  }
+
+  Status MarkNoop(bool) override {
+    row_ << "NOOP ";
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    BeginCfOp("DELETE(", cf);
+    row_ << Hex(key) << " ";
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    BeginCfOp("SINGLE_DELETE(", cf);
+    row_ << Hex(key) << " ";
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    BeginCfOp("DELETE_RANGE(", cf);
+    row_ << Hex(begin_key) << " ";
+    row_ << Hex(end_key) << " ";
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool unprepare) override {
+    row_ << "BEGIN_PREPARE(";
+    if (unprepare) {
+      row_ << "true";
+    } else {
+      row_ << "false";
+    }
+    row_ << ") ";
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& xid) override {
+    XidMarker("END_PREPARE(", xid);
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice& xid) override {
+    XidMarker("ROLLBACK(", xid);
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& xid) override {
+    XidMarker("COMMIT(", xid);
+    return Status::OK();
+  }
+
+ protected:
+  bool WriteAfterCommit() const override { return write_after_commit_; }
+
+ private:
+  // Hex encoding shared by every key/value/xid that is printed.
+  static std::string Hex(const Slice& s) {
+    return LDBCommand::StringToHex(s.ToString());
+  }
+
+  // Writes the "<OP>(<column family id>) : " prefix of a data operation.
+  void BeginCfOp(const char* opening, uint32_t cf) {
+    row_ << opening << cf << ") : ";
+  }
+
+  // Writes a transaction marker of the form "<MARKER>(<hex xid>) ".
+  void XidMarker(const char* opening, const Slice& xid) {
+    row_ << opening;
+    row_ << Hex(xid) << ") ";
+  }
+
+  std::stringstream& row_;
+  bool print_values_;
+  bool write_after_commit_;
+};
+
+// Dumps the write-ahead log file `wal_file` to stdout, one line per write
+// batch: sequence, count, byte size, physical offset, then the operations of
+// the batch as rendered by InMemoryHandler.
+//
+// options            - supplies the Env used to open the file and the info log
+// wal_file           - path of the WAL file to dump
+// print_header       - print a column header line first
+// print_values       - also print values, not just keys
+// is_write_committed - forwarded to InMemoryHandler as write_after_commit
+// exec_state         - if non-null, receives the failure when the file
+//                      cannot be opened; otherwise the error goes to stderr
+void DumpWalFile(Options options, std::string wal_file, bool print_header,
+                 bool print_values, bool is_write_committed,
+                 LDBCommandExecuteResult* exec_state) {
+  Env* env = options.env;
+  EnvOptions soptions(options);
+  std::unique_ptr<SequentialFileReader> wal_file_reader;
+
+  Status status;
+  {
+    std::unique_ptr<SequentialFile> file;
+    status = env->NewSequentialFile(wal_file, &file, soptions);
+    if (status.ok()) {
+      wal_file_reader.reset(new SequentialFileReader(
+          NewLegacySequentialFileWrapper(file), wal_file));
+    }
+  }
+  if (!status.ok()) {
+    if (exec_state) {
+      *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " +
+                                                    status.ToString());
+    } else {
+      std::cerr << "Error: Failed to open WAL file " << status.ToString()
+                << std::endl;
+    }
+    return;
+  }
+
+  StdErrReporter reporter;
+  uint64_t log_number;
+  FileType type;
+
+  // we need the log number, but ParseFilename expects dbname/NNN.log.
+  std::string sanitized = wal_file;
+  size_t lastslash = sanitized.rfind('/');
+  if (lastslash != std::string::npos) {
+    sanitized = sanitized.substr(lastslash + 1);
+  }
+  if (!ParseFileName(sanitized, &log_number, &type)) {
+    // bogus input, carry on as best we can
+    log_number = 0;
+  }
+  log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter,
+                     true /* checksum */, log_number);
+  std::string scratch;
+  WriteBatch batch;
+  Slice record;
+  std::stringstream row;
+  if (print_header) {
+    std::cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)";
+    if (print_values) {
+      std::cout << " : value ";
+    }
+    std::cout << "\n";
+  }
+  while (reader.ReadRecord(&record, &scratch)) {
+    row.str("");
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+    } else {
+      WriteBatchInternal::SetContents(&batch, record);
+      row << WriteBatchInternal::Sequence(&batch) << ",";
+      row << WriteBatchInternal::Count(&batch) << ",";
+      row << WriteBatchInternal::ByteSize(&batch) << ",";
+      row << reader.LastRecordOffset() << ",";
+      InMemoryHandler handler(row, print_values, is_write_committed);
+      // A batch that cannot be fully iterated used to be dumped as if it
+      // were complete. Report it the same way as a too-small record and keep
+      // going, since this is a best-effort dump.
+      Status iterate_status = batch.Iterate(&handler);
+      if (!iterate_status.ok()) {
+        reporter.Corruption(record.size(), iterate_status);
+      }
+      row << "\n";
+    }
+    std::cout << row.str();
+  }
+}
+
+} // namespace
+
+// Names of the command-line options and flags read by WALDumperCommand.
+const std::string WALDumperCommand::ARG_WAL_FILE = "walfile";
+const std::string WALDumperCommand::ARG_WRITE_COMMITTED = "write_committed";
+const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value";
+const std::string WALDumperCommand::ARG_PRINT_HEADER = "header";
+
+// Reads the WAL file path, the header/value flags and the write_committed
+// option (defaulting to true). A missing WAL file path is recorded as a
+// failure in exec_state_.
+WALDumperCommand::WALDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions({ARG_WAL_FILE, ARG_WRITE_COMMITTED,
+                                      ARG_PRINT_HEADER, ARG_PRINT_VALUE})),
+      print_header_(false),
+      print_values_(false),
+      is_write_committed_(false) {
+  wal_file_.clear();
+
+  const auto wal_entry = options.find(ARG_WAL_FILE);
+  if (wal_entry != options.end()) {
+    wal_file_ = wal_entry->second;
+  }
+
+  print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER);
+  print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE);
+  is_write_committed_ = ParseBooleanOption(options, ARG_WRITE_COMMITTED, true);
+
+  if (!wal_file_.empty()) {
+    return;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed("Argument " + ARG_WAL_FILE +
+                                                " must be specified.");
+}
+
+// Appends this command's usage line (required WAL file path plus the
+// optional flags) to `ret`.
+void WALDumperCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += WALDumperCommand::Name();
+  usage += " --" + ARG_WAL_FILE + "=<write_ahead_log_file_path>";
+  usage += " [--" + ARG_PRINT_HEADER + "] ";
+  usage += " [--" + ARG_PRINT_VALUE + "] ";
+  usage += " [--" + ARG_WRITE_COMMITTED + "=true|false] ";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Dumps the configured WAL file; DumpWalFile reports an open failure through
+// exec_state_.
+void WALDumperCommand::DoCommand() {
+ DumpWalFile(options_, wal_file_, print_header_, print_values_,
+ is_write_committed_, &exec_state_);
+}
+
+// ----------------------------------------------------------------------------
+
+// Expects exactly one positional parameter, the key to look up; anything
+// else is recorded as a failure. The key is hex-decoded when key-hex mode
+// is on.
+GetCommand::GetCommand(const std::vector<std::string>& params,
+                       const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  const bool single_key_given = (params.size() == 1);
+  if (single_key_given) {
+    key_ = params.front();
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> must be specified for the get command");
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+}
+
+// Appends this command's usage line to `ret`.
+void GetCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += GetCommand::Name();
+  usage += " <key>";
+  usage += " [--" + ARG_TTL + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Looks up key_ and prints its value (hex-encoded in value-hex mode) to
+// stdout. A failed lookup is recorded in exec_state_.
+void GetCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  std::string value;
+  const Status lookup = db_->Get(ReadOptions(), GetCfHandle(), key_, &value);
+  if (!lookup.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(lookup.ToString());
+    return;
+  }
+
+  fprintf(stdout, "%s\n",
+          (is_value_hex_ ? StringToHex(value) : value).c_str());
+}
+
+// ----------------------------------------------------------------------------
+
+// Requires both range options (from and to); a missing one is recorded as a
+// failure and construction stops there. Both keys are hex-decoded in key-hex
+// mode.
+ApproxSizeCommand::ApproxSizeCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions(
+                     {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) {
+  const auto from_entry = options.find(ARG_FROM);
+  if (from_entry == options.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_FROM + " must be specified for approxsize command");
+    return;
+  }
+  start_key_ = from_entry->second;
+
+  const auto to_entry = options.find(ARG_TO);
+  if (to_entry == options.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_TO + " must be specified for approxsize command");
+    return;
+  }
+  end_key_ = to_entry->second;
+
+  if (is_key_hex_) {
+    start_key_ = HexToString(start_key_);
+    end_key_ = HexToString(end_key_);
+  }
+}
+
+// Appends this command's usage line, including the shared range arguments,
+// to `ret`.
+void ApproxSizeCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += ApproxSizeCommand::Name();
+  usage += HelpRangeCmdArgs();
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Prints the approximate on-disk size, in bytes, of the key range
+// [start_key_, end_key_) to stdout.
+void ApproxSizeCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Range ranges[1];
+  ranges[0] = Range(start_key_, end_key_);
+  uint64_t sizes[1];
+  // NOTE(review): per the original comment this overload returns void, so
+  // there is no status to propagate into exec_state_ - confirm against the
+  // DB header of the RocksDB version in use.
+  db_->GetApproximateSizes(GetCfHandle(), ranges, 1, sizes);
+  // unsigned long is only 32 bits wide on LLP64 platforms; print through
+  // unsigned long long so sizes above 4 GiB are not truncated.
+  fprintf(stdout, "%llu\n", static_cast<unsigned long long>(sizes[0]));
+}
+
+// ----------------------------------------------------------------------------
+
+// Collects the positional parameters as <key> <value> pairs, hex-decoding
+// each side according to the hex modes. Fewer than two parameters or an odd
+// count is recorded as a failure.
+BatchPutCommand::BatchPutCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
+  const size_t param_count = params.size();
+  if (param_count < 2) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "At least one <key> <value> pair must be specified batchput.");
+  } else if (param_count % 2 != 0) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Equal number of <key>s and <value>s must be specified for batchput.");
+  } else {
+    for (size_t idx = 0; idx + 1 < param_count; idx += 2) {
+      const std::string& raw_key = params[idx];
+      const std::string& raw_value = params[idx + 1];
+      key_values_.emplace_back(
+          is_key_hex_ ? HexToString(raw_key) : raw_key,
+          is_value_hex_ ? HexToString(raw_value) : raw_value);
+    }
+  }
+  create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+}
+
+// Appends this command's usage line to `ret`.
+void BatchPutCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += BatchPutCommand::Name();
+  usage += " <key> <value> [<key> <value>] [..]";
+  usage += " [--" + ARG_TTL + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Writes all collected key/value pairs in a single WriteBatch. Prints "OK"
+// on success; a failed write is recorded in exec_state_.
+void BatchPutCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  WriteBatch batch;
+  for (const auto& kv : key_values_) {
+    batch.Put(GetCfHandle(), kv.first, kv.second);
+  }
+
+  const Status write_status = db_->Write(WriteOptions(), &batch);
+  if (!write_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(write_status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// Base open options with create_if_missing taken from the command-line flag.
+Options BatchPutCommand::PrepareOptionsForOpenDB() {
+  Options open_options = LDBCommand::PrepareOptionsForOpenDB();
+  open_options.create_if_missing = create_if_missing_;
+  return open_options;
+}
+
+// ----------------------------------------------------------------------------
+
+// Reads the optional scan bounds (hex-decoded in key-hex mode), the no-value
+// flag and the maximum number of keys to scan. An unparsable key limit is
+// recorded as a failure in exec_state_.
+ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
+                         const std::map<std::string, std::string>& options,
+                         const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX,
+                               ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
+                               ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})),
+      start_key_specified_(false),
+      end_key_specified_(false),
+      max_keys_scanned_(-1),
+      no_value_(false) {
+  auto entry = options.find(ARG_FROM);
+  if (entry != options.end()) {
+    start_key_ = is_key_hex_ ? HexToString(entry->second) : entry->second;
+    start_key_specified_ = true;
+  }
+
+  entry = options.find(ARG_TO);
+  if (entry != options.end()) {
+    end_key_ = is_key_hex_ ? HexToString(entry->second) : entry->second;
+    end_key_specified_ = true;
+  }
+
+  if (std::find(flags.begin(), flags.end(), ARG_NO_VALUE) != flags.end()) {
+    no_value_ = true;
+  }
+
+  entry = options.find(ARG_MAX_KEYS);
+  if (entry == options.end()) {
+    return;
+  }
+  const std::string& max_keys_text = entry->second;
+  try {
+#if defined(CYGWIN)
+    max_keys_scanned_ = strtol(max_keys_text.c_str(), 0, 10);
+#else
+    max_keys_scanned_ = std::stoi(max_keys_text);
+#endif
+  } catch (const std::invalid_argument&) {
+    exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                  " has an invalid value");
+  } catch (const std::out_of_range&) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_MAX_KEYS + " has a value out-of-range");
+  }
+}
+
+// Appends this command's usage line (range arguments plus the optional TTL,
+// timestamp, key-limit and no-value switches) to `ret`.
+void ScanCommand::Help(std::string& ret) {
+  ret.append(" ");
+  ret.append(ScanCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  // A stray 'q' used to follow <N> in the printed usage text.
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>] ");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append(" [--" + ARG_NO_VALUE + "]");
+  ret.append("\n");
+}
+
+// Iterates over the configured key range and prints each key (and value,
+// unless no_value_ is set) to stdout. For TTL databases, entries are filtered
+// to the [ttl_start, ttl_end) timestamp window. At most max_keys_scanned_
+// entries are printed when that limit is non-negative. Iterator errors are
+// recorded in exec_state_.
+void ScanCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  int num_keys_scanned = 0;
+  ReadOptions scan_read_opts;
+  scan_read_opts.total_order_seek = true;
+  // Owned by a unique_ptr so every exit path releases the iterator; the
+  // original needed a manual delete before each return.
+  std::unique_ptr<Iterator> it(
+      db_->NewIterator(scan_read_opts, GetCfHandle()));
+  if (start_key_specified_) {
+    it->Seek(start_key_);
+  } else {
+    it->SeekToFirst();
+  }
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    return;
+  }
+  if (is_db_ttl_ && timestamp_) {
+    fprintf(stdout, "Scanning key-values from %s to %s\n",
+            TimeToHumanString(ttl_start).c_str(),
+            TimeToHumanString(ttl_end).c_str());
+  }
+  for (;
+       it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_);
+       it->Next()) {
+    if (is_db_ttl_) {
+      TtlIterator* it_ttl =
+          static_cast_with_check<TtlIterator, Iterator>(it.get());
+      int rawtime = it_ttl->timestamp();
+      // Skip entries outside the requested timestamp window.
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+      if (timestamp_) {
+        fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str());
+      }
+    }
+
+    Slice key_slice = it->key();
+
+    // formatted_key must outlive key_slice, which may point into it.
+    std::string formatted_key;
+    if (is_key_hex_) {
+      formatted_key = "0x" + key_slice.ToString(true /* hex */);
+      key_slice = formatted_key;
+    } else if (ldb_options_.key_formatter) {
+      formatted_key = ldb_options_.key_formatter->Format(key_slice);
+      key_slice = formatted_key;
+    }
+
+    if (no_value_) {
+      fprintf(stdout, "%.*s\n", static_cast<int>(key_slice.size()),
+              key_slice.data());
+    } else {
+      Slice val_slice = it->value();
+      std::string formatted_value;
+      if (is_value_hex_) {
+        formatted_value = "0x" + val_slice.ToString(true /* hex */);
+        val_slice = formatted_value;
+      }
+      fprintf(stdout, "%.*s : %.*s\n", static_cast<int>(key_slice.size()),
+              key_slice.data(), static_cast<int>(val_slice.size()),
+              val_slice.data());
+    }
+
+    num_keys_scanned++;
+    if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
+      break;
+    }
+  }
+  if (!it->status().ok()) {  // Check for any errors found during the scan
+    exec_state_ = LDBCommandExecuteResult::Failed(it->status().ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Expects exactly one positional parameter, the key to delete (hex-decoded
+// in key-hex mode); anything else is recorded as a failure.
+DeleteCommand::DeleteCommand(const std::vector<std::string>& params,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "KEY must be specified for the delete command");
+    return;
+  }
+
+  const std::string& raw_key = params.front();
+  key_ = is_key_hex_ ? HexToString(raw_key) : raw_key;
+}
+
+// Appends this command's usage line to `ret`.
+void DeleteCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += DeleteCommand::Name() + " <key>";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Deletes key_ from the database. Prints "OK" on success; a failed delete is
+// recorded in exec_state_.
+void DeleteCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  const Status delete_status =
+      db_->Delete(WriteOptions(), GetCfHandle(), key_);
+  if (!delete_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(delete_status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// Expects exactly two positional parameters, the begin and end keys of the
+// range to delete (both hex-decoded in key-hex mode); anything else is
+// recorded as a failure.
+DeleteRangeCommand::DeleteRangeCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 2) {
+    // The message used to name the "delete" command; use this command's own
+    // name so the user is pointed at the right usage.
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "begin and end keys must be specified for the " +
+        DeleteRangeCommand::Name() + " command");
+  } else {
+    begin_key_ = params.at(0);
+    end_key_ = params.at(1);
+    if (is_key_hex_) {
+      begin_key_ = HexToString(begin_key_);
+      end_key_ = HexToString(end_key_);
+    }
+  }
+}
+
+// Appends this command's usage line to `ret`.
+void DeleteRangeCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += DeleteRangeCommand::Name() + " <begin key> <end key>";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Issues a DeleteRange for begin_key_ / end_key_. Prints "OK" on success; a
+// failure is recorded in exec_state_.
+void DeleteRangeCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  const Status range_status =
+      db_->DeleteRange(WriteOptions(), GetCfHandle(), begin_key_, end_key_);
+  if (!range_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(range_status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// Expects exactly two positional parameters, <key> and <value>; anything
+// else is recorded as a failure. Key and value are hex-decoded according to
+// the hex modes, and the create-if-missing flag is captured.
+PutCommand::PutCommand(const std::vector<std::string>& params,
+                       const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
+  const bool pair_given = (params.size() == 2);
+  if (pair_given) {
+    key_ = params[0];
+    value_ = params[1];
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> and <value> must be specified for the put command");
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+  if (is_value_hex_) {
+    value_ = HexToString(value_);
+  }
+
+  create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+}
+
+// Appends this command's usage line to `ret`.
+void PutCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += PutCommand::Name();
+  usage += " <key> <value> ";
+  usage += " [--" + ARG_TTL + "]";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Stores value_ under key_. Prints "OK" on success; a failed write is
+// recorded in exec_state_.
+void PutCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  const Status put_status =
+      db_->Put(WriteOptions(), GetCfHandle(), key_, value_);
+  if (!put_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(put_status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// Base open options with create_if_missing taken from the command-line flag.
+Options PutCommand::PrepareOptionsForOpenDB() {
+  Options open_options = LDBCommand::PrepareOptionsForOpenDB();
+  open_options.create_if_missing = create_if_missing_;
+  return open_options;
+}
+
+// ----------------------------------------------------------------------------
+
+// Command words recognised at the start of a line by
+// DBQuerierCommand::DoCommand.
+const char* DBQuerierCommand::HELP_CMD = "help";
+const char* DBQuerierCommand::GET_CMD = "get";
+const char* DBQuerierCommand::PUT_CMD = "put";
+const char* DBQuerierCommand::DELETE_CMD = "delete";
+
+// Only forwards to LDBCommand with the TTL and hex options accepted; the
+// positional parameters are unused and there is no command-specific state.
+DBQuerierCommand::DBQuerierCommand(
+ const std::vector<std::string>& /*params*/,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : LDBCommand(
+ options, flags, false,
+ BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+
+}
+
+// Appends this command's usage line and a one-line description of the
+// interactive shell to `ret`.
+void DBQuerierCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += DBQuerierCommand::Name();
+  usage += " [--" + ARG_TTL + "]";
+  usage += "\n";
+  usage += " Starts a REPL shell. Type help for list of available "
+           "commands.";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Interactive shell: reads lines from stdin until EOF, splits each line on
+// single spaces and executes help / get / put / delete against the open DB.
+// Keys and values are hex-decoded according to the hex modes. Results go to
+// stdout; failed operations are reported on stderr.
+void DBQuerierCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  ReadOptions read_options;
+  WriteOptions write_options;
+
+  std::string line;
+  std::string key;
+  std::string value;
+  while (getline(std::cin, line, '\n')) {
+    // Parse line into std::vector<std::string>
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    while (true) {
+      size_t pos2 = line.find(' ', pos);
+      if (pos2 == std::string::npos) {
+        break;
+      }
+      tokens.push_back(line.substr(pos, pos2 - pos));
+      pos = pos2 + 1;
+    }
+    // The remainder is always pushed, so tokens is never empty.
+    tokens.push_back(line.substr(pos));
+
+    const std::string& cmd = tokens[0];
+
+    if (cmd == HELP_CMD) {
+      fprintf(stdout,
+              "get <key>\n"
+              "put <key> <value>\n"
+              "delete <key>\n");
+    } else if (cmd == DELETE_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      // Only claim success when the write actually succeeded.
+      Status s = db_->Delete(write_options, GetCfHandle(), Slice(key));
+      if (s.ok()) {
+        fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str());
+      } else {
+        fprintf(stderr, "Failed to delete %s: %s\n", tokens[1].c_str(),
+                s.ToString().c_str());
+      }
+    } else if (cmd == PUT_CMD && tokens.size() == 3) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]);
+      Status s =
+          db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value));
+      if (s.ok()) {
+        fprintf(stdout, "Successfully put %s %s\n", tokens[1].c_str(),
+                tokens[2].c_str());
+      } else {
+        fprintf(stderr, "Failed to put %s %s: %s\n", tokens[1].c_str(),
+                tokens[2].c_str(), s.ToString().c_str());
+      }
+    } else if (cmd == GET_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      Status s = db_->Get(read_options, GetCfHandle(), Slice(key), &value);
+      if (s.ok()) {
+        fprintf(stdout, "%s\n",
+                PrintKeyValue(key, value, is_key_hex_, is_value_hex_).c_str());
+      } else if (s.IsNotFound()) {
+        fprintf(stdout, "Not found %s\n", tokens[1].c_str());
+      } else {
+        // Do not disguise I/O or corruption errors as a missing key.
+        fprintf(stderr, "Failed to get %s: %s\n", tokens[1].c_str(),
+                s.ToString().c_str());
+      }
+    } else {
+      fprintf(stdout, "Unknown command %s\n", line.c_str());
+    }
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Only forwards to LDBCommand with an empty list of accepted options; the
+// positional parameters are unused.
+CheckConsistencyCommand::CheckConsistencyCommand(
+ const std::vector<std::string>& /*params*/,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+
+// Appends this command's usage line (just the command name) to `ret`.
+void CheckConsistencyCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += CheckConsistencyCommand::Name();
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Opens the database read-only with paranoid checks enabled and reports
+// whether the open succeeded: "OK" on stdout, otherwise the status is
+// recorded in exec_state_. Does nothing if the command has already left the
+// not-started state.
+void CheckConsistencyCommand::DoCommand() {
+  Options opt = PrepareOptionsForOpenDB();
+  opt.paranoid_checks = true;
+  if (!exec_state_.IsNotStarted()) {
+    return;
+  }
+  // Initialised so the delete below is well-defined even if a failed open
+  // leaves the output pointer untouched.
+  DB* db = nullptr;
+  Status st = DB::OpenForReadOnly(opt, db_path_, &db, false);
+  delete db;
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Name of the option whose value is passed to CreateCheckpoint as the
+// checkpoint directory.
+const std::string CheckPointCommand::ARG_CHECKPOINT_DIR = "checkpoint_dir";
+
+// Captures the checkpoint directory option when it is present;
+// checkpoint_dir_ is left untouched otherwise.
+CheckPointCommand::CheckPointCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_CHECKPOINT_DIR})) {
+  const auto dir_entry = options.find(ARG_CHECKPOINT_DIR);
+  if (dir_entry == options.end()) {
+    return;
+  }
+  checkpoint_dir_ = dir_entry->second;
+}
+
+// Appends this command's usage line to `ret`.
+void CheckPointCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += CheckPointCommand::Name();
+  usage += " [--" + ARG_CHECKPOINT_DIR + "] ";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Creates a checkpoint of the open database in checkpoint_dir_. Prints "OK"
+// on success; a failure of either step is recorded in exec_state_.
+void CheckPointCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Checkpoint* raw_checkpoint = nullptr;
+  Status status = Checkpoint::Create(db_, &raw_checkpoint);
+  // Take ownership right away: the Checkpoint object used to be leaked.
+  std::unique_ptr<Checkpoint> checkpoint(raw_checkpoint);
+  // Only use the object if Create succeeded; its status used to be
+  // overwritten unchecked and the pointer dereferenced regardless.
+  if (status.ok()) {
+    status = checkpoint->CreateCheckpoint(checkpoint_dir_);
+  }
+  if (status.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// Only forwards to LDBCommand with an empty list of accepted options; the
+// positional parameters are unused.
+RepairCommand::RepairCommand(const std::vector<std::string>& /*params*/,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+
+// Appends this command's usage line (just the command name) to `ret`.
+void RepairCommand::Help(std::string& ret) {
+  std::string usage(" ");
+  usage += RepairCommand::Name();
+  usage += "\n";
+  ret.append(usage);
+}
+
+// Runs RepairDB on db_path_ with a stderr logger at warning level. Prints
+// "OK" on success; a failure is recorded in exec_state_.
+void RepairCommand::DoCommand() {
+  Options repair_options = PrepareOptionsForOpenDB();
+  repair_options.info_log.reset(new StderrLogger(InfoLogLevel::WARN_LEVEL));
+
+  const Status repair_status = RepairDB(db_path_, repair_options);
+  if (!repair_status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(repair_status.ToString());
+    return;
+  }
+  fprintf(stdout, "OK\n");
+}
+
+// ----------------------------------------------------------------------------
+
+// Names of the command-line options read by BackupableCommand.
+const std::string BackupableCommand::ARG_NUM_THREADS = "num_threads";
+const std::string BackupableCommand::ARG_BACKUP_ENV_URI = "backup_env_uri";
+const std::string BackupableCommand::ARG_BACKUP_DIR = "backup_dir";
+const std::string BackupableCommand::ARG_STDERR_LOG_LEVEL = "stderr_log_level";
+
+// Parses the options shared by the backup and restore commands: thread count
+// (default 1), backup Env URI, backup directory (required) and an optional
+// stderr log level. Invalid or missing values are recorded as failures in
+// exec_state_.
+BackupableCommand::BackupableCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_BACKUP_ENV_URI, ARG_BACKUP_DIR,
+                                      ARG_NUM_THREADS, ARG_STDERR_LOG_LEVEL})),
+      num_threads_(1) {
+  auto itr = options.find(ARG_NUM_THREADS);
+  if (itr != options.end()) {
+    // std::stoi throws on non-numeric or oversized input; turn that into a
+    // command failure, as ScanCommand does, instead of letting it escape.
+    try {
+      num_threads_ = std::stoi(itr->second);
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_NUM_THREADS +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_NUM_THREADS + " has a value out-of-range");
+    }
+  }
+  itr = options.find(ARG_BACKUP_ENV_URI);
+  if (itr != options.end()) {
+    backup_env_uri_ = itr->second;
+  }
+  itr = options.find(ARG_BACKUP_DIR);
+  if (itr == options.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("--" + ARG_BACKUP_DIR +
+                                                  ": missing backup directory");
+  } else {
+    backup_dir_ = itr->second;
+  }
+
+  itr = options.find(ARG_STDERR_LOG_LEVEL);
+  if (itr != options.end()) {
+    int stderr_log_level = 0;
+    bool parsed = false;
+    try {
+      stderr_log_level = std::stoi(itr->second);
+      parsed = true;
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_STDERR_LOG_LEVEL +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_STDERR_LOG_LEVEL + " has a value out-of-range");
+    }
+    if (!parsed) {
+      // Failure already recorded above; no logger is installed.
+    } else if (stderr_log_level < 0 ||
+               stderr_log_level >= InfoLogLevel::NUM_INFO_LOG_LEVELS) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_STDERR_LOG_LEVEL + " must be >= 0 and < " +
+          std::to_string(InfoLogLevel::NUM_INFO_LOG_LEVELS) + ".");
+    } else {
+      logger_.reset(
+          new StderrLogger(static_cast<InfoLogLevel>(stderr_log_level)));
+    }
+  }
+}
+
+// Appends the usage line shared by the backup-style commands to `ret`, using
+// `name` as the command name.
+void BackupableCommand::Help(const std::string& name, std::string& ret) {
+  std::string usage(" ");
+  usage += name;
+  usage += " [--" + ARG_BACKUP_ENV_URI + "] ";
+  usage += " [--" + ARG_BACKUP_DIR + "] ";
+  usage += " [--" + ARG_NUM_THREADS + "] ";
+  usage += " [--" + ARG_STDERR_LOG_LEVEL + "=<int (InfoLogLevel)>] ";
+  usage += "\n";
+  ret.append(usage);
+}
+
+// ----------------------------------------------------------------------------
+
+// All option parsing is done by the BackupableCommand base constructor.
+BackupCommand::BackupCommand(const std::vector<std::string>& params,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : BackupableCommand(params, options, flags) {}
+
+// Appends the shared backup usage line, under this command's name, to `ret`.
+void BackupCommand::Help(std::string& ret) {
+ BackupableCommand::Help(Name(), ret);
+}
+
+// Opens a backup engine on backup_dir_ (through the Env loaded from
+// backup_env_uri_) and creates a new backup of the open database. Progress
+// is printed to stdout; failures are recorded in exec_state_.
+void BackupCommand::DoCommand() {
+  Status status;
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  fprintf(stdout, "open db OK\n");
+  Env* custom_env = nullptr;
+  Env::LoadEnv(backup_env_uri_, &custom_env, &backup_env_guard_);
+  assert(custom_env != nullptr);
+
+  BackupableDBOptions backup_options =
+      BackupableDBOptions(backup_dir_, custom_env);
+  backup_options.info_log = logger_.get();
+  backup_options.max_background_operations = num_threads_;
+
+  BackupEngine* raw_backup_engine = nullptr;
+  status = BackupEngine::Open(custom_env, backup_options, &raw_backup_engine);
+  // Owned from here on so the engine is released on every return path; it
+  // used to be leaked.
+  std::unique_ptr<BackupEngine> backup_engine(raw_backup_engine);
+  if (status.ok()) {
+    fprintf(stdout, "open backup engine OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+  status = backup_engine->CreateNewBackup(db_);
+  if (status.ok()) {
+    fprintf(stdout, "create new backup OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+// All option parsing is done by the BackupableCommand base constructor.
+RestoreCommand::RestoreCommand(
+ const std::vector<std::string>& params,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : BackupableCommand(params, options, flags) {}
+
+// Appends the shared backup usage line, under this command's name, to `ret`.
+void RestoreCommand::Help(std::string& ret) {
+ BackupableCommand::Help(Name(), ret);
+}
+
+// Opens a read-only backup engine on backup_dir_ and restores the latest
+// backup into db_path_ (used both as DB directory and as WAL directory).
+// Progress is printed to stdout; failures are recorded in exec_state_.
+void RestoreCommand::DoCommand() {
+  Env* custom_env = nullptr;
+  Env::LoadEnv(backup_env_uri_, &custom_env, &backup_env_guard_);
+  assert(custom_env != nullptr);
+
+  BackupableDBOptions opts(backup_dir_, custom_env);
+  opts.info_log = logger_.get();
+  opts.max_background_operations = num_threads_;
+
+  BackupEngineReadOnly* raw_restore_engine_ptr;
+  Status status =
+      BackupEngineReadOnly::Open(custom_env, opts, &raw_restore_engine_ptr);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+  // Take ownership only after a successful open.
+  std::unique_ptr<BackupEngineReadOnly> restore_engine(raw_restore_engine_ptr);
+  fprintf(stdout, "open restore engine OK\n");
+
+  status = restore_engine->RestoreDBFromLatestBackup(db_path_, db_path_);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+  fprintf(stdout, "restore from backup OK\n");
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+// Dumps the SST file `filename` through SstFileDumper (checksum verification
+// and blob index decoding disabled) and, when `show_properties` is set,
+// prints the file's table properties to stdout. A file name that does not
+// end in ".sst" is rejected with a message on stdout; read errors go to
+// stderr.
+void DumpSstFile(Options options, std::string filename, bool output_hex,
+                 bool show_properties) {
+  std::string from_key;
+  std::string to_key;
+  if (filename.length() <= 4 ||
+      filename.rfind(".sst") != filename.length() - 4) {
+    std::cout << "Invalid sst file name." << std::endl;
+    return;
+  }
+  // no verification
+  // TODO: add support for decoding blob indexes in ldb as well
+  ROCKSDB_NAMESPACE::SstFileDumper dumper(
+      options, filename, /* verify_checksum */ false, output_hex,
+      /* decode_blob_index */ false);
+  // No key bounds are applied: from_key / to_key are placeholders only.
+  Status st = dumper.ReadSequential(true, std::numeric_limits<uint64_t>::max(),
+                                    false,            // has_from
+                                    from_key, false,  // has_to
+                                    to_key);
+  if (!st.ok()) {
+    // The file name and the status text used to be printed back to back
+    // with nothing separating them.
+    std::cerr << "Error in reading SST file " << filename << ": "
+              << st.ToString() << std::endl;
+    return;
+  }
+
+  if (show_properties) {
+    const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+
+    std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>
+        table_properties_from_reader;
+    st = dumper.ReadTableProperties(&table_properties_from_reader);
+    if (!st.ok()) {
+      // Fall back to the properties the dumper already holds.
+      std::cerr << filename << ": " << st.ToString()
+                << ". Try to use initial table properties" << std::endl;
+      table_properties = dumper.GetInitTableProperties();
+    } else {
+      table_properties = table_properties_from_reader.get();
+    }
+    if (table_properties != nullptr) {
+      std::cout << std::endl << "Table Properties:" << std::endl;
+      std::cout << table_properties->ToString("\n") << std::endl;
+    }
+  }
+}
+
+} // namespace
+
+// Constructs the "dump_live_files" command. Positional params are ignored
+// (the parameter is unnamed). The LDBCommand base receives `true` as its third
+// argument (annotated `is_read_only` on other constructors in this file) and
+// an empty list of accepted command-line options.
+DBFileDumperCommand::DBFileDumperCommand(
+ const std::vector<std::string>& /*params*/,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags)
+ : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {}
+
+// Appends this command's usage line to `ret`: a leading indent, the command
+// name and a newline. The command takes no arguments of its own.
+void DBFileDumperCommand::Help(std::string& ret) {
+ ret.append(" ");
+ ret.append(DBFileDumperCommand::Name());
+ ret.append("\n");
+}
+
+// Dumps three sections to stdout for the open database: the manifest named by
+// the CURRENT file, every live SST file (with table properties), and every
+// WAL file. A problem in one section is reported on stderr and the remaining
+// sections are still attempted.
+void DBFileDumperCommand::DoCommand() {
+  if (!db_) {
+    // Opening the DB failed earlier; the failure is already recorded.
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status s;
+
+  std::cout << "Manifest File" << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::string manifest_filename;
+  s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()),
+                       &manifest_filename);
+  if (!s.ok() || manifest_filename.empty() ||
+      manifest_filename.back() != '\n') {
+    // Without a well-formed CURRENT file there is no manifest name to dump.
+    // Skip the manifest instead of falling through: trimming an empty string
+    // would underflow the size, and a malformed name would be dumped as-is.
+    std::cerr << "Error when reading CURRENT file "
+              << CurrentFileName(db_->GetName()) << std::endl;
+  } else {
+    // remove the trailing '\n'
+    manifest_filename.pop_back();
+    std::string manifest_filepath = db_->GetName() + "/" + manifest_filename;
+    std::cout << manifest_filepath << std::endl;
+    DumpManifestFile(options_, manifest_filepath, false, false, false);
+  }
+  std::cout << std::endl;
+
+  std::cout << "SST Files" << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  for (auto& fileMetadata : metadata) {
+    std::string filename = fileMetadata.db_path + fileMetadata.name;
+    std::cout << filename << " level:" << fileMetadata.level << std::endl;
+    std::cout << "------------------------------" << std::endl;
+    DumpSstFile(options_, filename, false, true);
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  std::cout << "Write Ahead Log Files" << std::endl;
+  std::cout << "==============================" << std::endl;
+  ROCKSDB_NAMESPACE::VectorLogPtr wal_files;
+  s = db_->GetSortedWalFiles(wal_files);
+  if (!s.ok()) {
+    std::cerr << "Error when getting WAL files" << std::endl;
+  } else {
+    for (auto& wal : wal_files) {
+      // TODO(qyang): option.wal_dir should be passed into ldb command
+      std::string filename = db_->GetOptions().wal_dir + wal->PathName();
+      std::cout << filename << std::endl;
+      // TODO(myabandeh): allow configuring is_write_committed
+      DumpWalFile(options_, filename, true, true, true /* is_write_committed */,
+                  &exec_state_);
+    }
+  }
+}
+
+// Appends this command's usage line to `ret`: a leading indent, the command
+// name, the required <output_sst_path> positional argument and a newline.
+void WriteExternalSstFilesCommand::Help(std::string& ret) {
+ ret.append(" ");
+ ret.append(WriteExternalSstFilesCommand::Name());
+ ret.append(" <output_sst_path>");
+ ret.append("\n");
+}
+
+// Parses the arguments of the SST-writing command.
+// Exactly one positional parameter, the output SST path, is required;
+// otherwise the command is marked as failed. create_if_missing_ is enabled by
+// either the bare flag or the boolean option of the same name.
+WriteExternalSstFilesCommand::WriteExternalSstFilesCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false /* is_read_only */,
+          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                               ARG_TO, ARG_CREATE_IF_MISSING})) {
+  // The option is only consulted when the bare flag is absent.
+  const bool flag_given = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
+  create_if_missing_ =
+      flag_given || ParseBooleanOption(options, ARG_CREATE_IF_MISSING, false);
+
+  if (params.size() == 1) {
+    output_sst_path_ = params[0];
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "output SST file path must be specified");
+  }
+}
+
+// Reads key/value lines from standard input and writes them to a new SST file
+// at output_sst_path_. Lines that do not parse are counted and reported as a
+// warning, except for two known informational prefixes which are skipped
+// silently. Any writer error aborts the command and is stored in exec_state_.
+void WriteExternalSstFilesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  ColumnFamilyHandle* cfh = GetCfHandle();
+  SstFileWriter writer(EnvOptions(), db_->GetOptions(), cfh);
+  Status status = writer.Open(output_sst_path_);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("failed to open SST file: " +
+                                                  status.ToString());
+    return;
+  }
+
+  // Use /dev/stdin when it can be opened, otherwise fall back to std::cin.
+  std::ifstream dev_stdin("/dev/stdin");
+  std::istream& input =
+      dev_stdin.is_open() ? static_cast<std::istream&>(dev_stdin) : std::cin;
+
+  int skipped = 0;
+  std::string line;
+  while (std::getline(input, line, '\n')) {
+    std::string key;
+    std::string value;
+    if (!ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      // These two prefixes are tolerated and do not count as bad lines.
+      const bool ignorable = line.find("Keys in range:") == 0 ||
+                             line.find("Created bg thread 0x") == 0;
+      if (!ignorable) {
+        ++skipped;
+      }
+      continue;
+    }
+    status = writer.Put(key, value);
+    if (!status.ok()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "failed to write record to file: " + status.ToString());
+      return;
+    }
+  }
+
+  status = writer.Finish();
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Failed to finish writing to file: " + status.ToString());
+    return;
+  }
+
+  if (skipped > 0) {
+    fprintf(stderr, "Warning: %d bad lines ignored.\n", skipped);
+  }
+  exec_state_ = LDBCommandExecuteResult::Succeed(
+      "external SST file written to " + output_sst_path_);
+}
+
+// Starts from the base-class options and overrides create_if_missing with the
+// value parsed in the constructor.
+Options WriteExternalSstFilesCommand::PrepareOptionsForOpenDB() {
+ Options opt = LDBCommand::PrepareOptionsForOpenDB();
+ opt.create_if_missing = create_if_missing_;
+ return opt;
+}
+
+// Command-line argument names accepted by IngestExternalSstFilesCommand. Its
+// constructor reads each one either as a bare flag or as a boolean option.
+const std::string IngestExternalSstFilesCommand::ARG_MOVE_FILES = "move_files";
+const std::string IngestExternalSstFilesCommand::ARG_SNAPSHOT_CONSISTENCY =
+ "snapshot_consistency";
+const std::string IngestExternalSstFilesCommand::ARG_ALLOW_GLOBAL_SEQNO =
+ "allow_global_seqno";
+const std::string IngestExternalSstFilesCommand::ARG_ALLOW_BLOCKING_FLUSH =
+ "allow_blocking_flush";
+const std::string IngestExternalSstFilesCommand::ARG_INGEST_BEHIND =
+ "ingest_behind";
+const std::string IngestExternalSstFilesCommand::ARG_WRITE_GLOBAL_SEQNO =
+ "write_global_seqno";
+
+// Appends this command's usage line to `ret`: the command name, the required
+// <input_sst_path> positional argument, then each optional flag in brackets.
+void IngestExternalSstFilesCommand::Help(std::string& ret) {
+ ret.append(" ");
+ ret.append(IngestExternalSstFilesCommand::Name());
+ ret.append(" <input_sst_path>");
+ ret.append(" [--" + ARG_MOVE_FILES + "] ");
+ ret.append(" [--" + ARG_SNAPSHOT_CONSISTENCY + "] ");
+ ret.append(" [--" + ARG_ALLOW_GLOBAL_SEQNO + "] ");
+ ret.append(" [--" + ARG_ALLOW_BLOCKING_FLUSH + "] ");
+ ret.append(" [--" + ARG_INGEST_BEHIND + "] ");
+ ret.append(" [--" + ARG_WRITE_GLOBAL_SEQNO + "] ");
+ ret.append("\n");
+}
+
+// Parses the arguments of the SST-ingesting command.
+// Every ingestion setting is true when its bare flag is present, otherwise it
+// takes the boolean option of the same name, otherwise its default.
+// Requesting write_global_seqno while global seqno is disallowed marks the
+// command as failed, as does anything other than exactly one positional
+// parameter (the input SST path).
+IngestExternalSstFilesCommand::IngestExternalSstFilesCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false /* is_read_only */,
+          BuildCmdLineOptions({ARG_MOVE_FILES, ARG_SNAPSHOT_CONSISTENCY,
+                               ARG_ALLOW_GLOBAL_SEQNO, ARG_CREATE_IF_MISSING,
+                               ARG_ALLOW_BLOCKING_FLUSH, ARG_INGEST_BEHIND,
+                               ARG_WRITE_GLOBAL_SEQNO})),
+      move_files_(false),
+      snapshot_consistency_(true),
+      allow_global_seqno_(true),
+      allow_blocking_flush_(true),
+      ingest_behind_(false),
+      write_global_seqno_(true) {
+  // Bare flag wins; the option is only parsed when the flag is absent.
+  const auto flag_or_option = [&](const std::string& name, bool fallback) {
+    return IsFlagPresent(flags, name) ||
+           ParseBooleanOption(options, name, fallback);
+  };
+
+  create_if_missing_ = flag_or_option(ARG_CREATE_IF_MISSING, false);
+  move_files_ = flag_or_option(ARG_MOVE_FILES, false);
+  snapshot_consistency_ = flag_or_option(ARG_SNAPSHOT_CONSISTENCY, true);
+  allow_global_seqno_ = flag_or_option(ARG_ALLOW_GLOBAL_SEQNO, true);
+  allow_blocking_flush_ = flag_or_option(ARG_ALLOW_BLOCKING_FLUSH, true);
+  ingest_behind_ = flag_or_option(ARG_INGEST_BEHIND, false);
+  write_global_seqno_ = flag_or_option(ARG_WRITE_GLOBAL_SEQNO, true);
+
+  if (allow_global_seqno_ && !write_global_seqno_) {
+    fprintf(stderr,
+            "Warning: not writing global_seqno to the ingested SST can\n"
+            "prevent older versions of RocksDB from being able to open it\n");
+  } else if (!allow_global_seqno_ && write_global_seqno_) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "ldb cannot write global_seqno to the ingested SST when global_seqno "
+        "is not allowed");
+  }
+
+  if (params.size() == 1) {
+    input_sst_path_ = params[0];
+  } else {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("input SST path must be specified");
+  }
+}
+
+// Ingests the single SST file at input_sst_path_ into the selected column
+// family, using the ingestion settings parsed by the constructor. Does nothing
+// if the command has already failed; the outcome is stored in exec_state_.
+void IngestExternalSstFilesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  // Argument validation in the constructor may already have failed.
+  if (GetExecuteState().IsFailed()) {
+    return;
+  }
+
+  ColumnFamilyHandle* cfh = GetCfHandle();
+
+  IngestExternalFileOptions ingest_opts;
+  ingest_opts.move_files = move_files_;
+  ingest_opts.snapshot_consistency = snapshot_consistency_;
+  ingest_opts.allow_global_seqno = allow_global_seqno_;
+  ingest_opts.allow_blocking_flush = allow_blocking_flush_;
+  ingest_opts.ingest_behind = ingest_behind_;
+  ingest_opts.write_global_seqno = write_global_seqno_;
+
+  Status status = db_->IngestExternalFile(cfh, {input_sst_path_}, ingest_opts);
+  if (status.ok()) {
+    exec_state_ =
+        LDBCommandExecuteResult::Succeed("external SST files ingested");
+    return;
+  }
+  exec_state_ = LDBCommandExecuteResult::Failed(
+      "failed to ingest external SST: " + status.ToString());
+}
+
+// Starts from the base-class options and overrides create_if_missing with the
+// value parsed in the constructor.
+Options IngestExternalSstFilesCommand::PrepareOptionsForOpenDB() {
+ Options opt = LDBCommand::PrepareOptionsForOpenDB();
+ opt.create_if_missing = create_if_missing_;
+ return opt;
+}
+
+// Parses the optional max-keys option. When it is absent, max_keys_ keeps its
+// in-class default. A value that cannot be parsed, or that is out of range,
+// marks the command as failed.
+ListFileRangeDeletesCommand::ListFileRangeDeletesCommand(
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) {
+  const auto found = options.find(ARG_MAX_KEYS);
+  if (found == options.end()) {
+    return;
+  }
+  const std::string& text = found->second;
+  try {
+#if defined(CYGWIN)
+    max_keys_ = strtol(text.c_str(), 0, 10);
+#else
+    max_keys_ = std::stoi(text);
+#endif
+  } catch (const std::invalid_argument&) {
+    exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                  " has an invalid value");
+  } catch (const std::out_of_range&) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_MAX_KEYS + " has a value out-of-range");
+  }
+}
+
+// Appends this command's usage line to `ret`: the command name, the optional
+// max-keys option and a one-line description.
+void ListFileRangeDeletesCommand::Help(std::string& ret) {
+ ret.append(" ");
+ ret.append(ListFileRangeDeletesCommand::Name());
+ ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+ ret.append(" : print tombstones in SST files.\n");
+}
+
+// Asks the underlying DBImpl for a range-tombstone summary of the selected
+// column family, passing max_keys_ as the limit, and prints it to stdout.
+// A failing status is stored in exec_state_ instead.
+void ListFileRangeDeletesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  DBImpl* impl = static_cast_with_check<DBImpl, DB>(db_->GetRootDB());
+
+  std::string summary;
+  Status st =
+      impl->TablesRangeTombstoneSummary(GetCfHandle(), max_keys_, &summary);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+
+  // Sync point that receives the text before it is printed.
+  TEST_SYNC_POINT_CALLBACK(
+      "ListFileRangeDeletesCommand::DoCommand:BeforePrint", &summary);
+  fprintf(stdout, "%s\n", summary.c_str());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_cmd_impl.h b/src/rocksdb/tools/ldb_cmd_impl.h
new file mode 100644
index 000000000..8477cae6f
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_impl.h
@@ -0,0 +1,628 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/utilities/ldb_cmd.h"
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace ROCKSDB_NAMESPACE {
+
+// LDBCommand subclass whose Name() is "compact".
+// NOTE(review): from_/to_ presumably bound the key range, with the null_*
+// flags marking an absent bound -- confirm against the definition.
+class CompactorCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "compact"; }
+
+  CompactorCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool null_from_;
+  std::string from_;
+  bool null_to_;
+  std::string to_;
+};
+
+// LDBCommand subclass whose Name() is "dump_live_files". It carries no state
+// of its own beyond what LDBCommand provides.
+class DBFileDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump_live_files"; }
+
+  DBFileDumperCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+};
+
+// LDBCommand subclass whose Name() is "dump".
+// NOTE(review): the meaning of the private fields is inferred from their
+// names only; confirm against the definition in ldb_cmd.cc.
+class DBDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump"; }
+
+  DBDumperCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  /**
+   * Returns the part of `s` after its last path separator, or `s` itself when
+   * it contains none. Both the forward slash (/) and the backslash (\) count
+   * as separators so that paths from different OS-s are handled.
+   */
+  static std::string GetFileNameFromPath(const std::string& s) {
+    const std::size_t last_sep = s.find_last_of("/\\");
+    if (last_sep != std::string::npos) {
+      return s.substr(last_sep + 1);
+    }
+    return s;
+  }
+
+  void DoDumpCommand();
+
+  bool null_from_;
+  std::string from_;
+  bool null_to_;
+  std::string to_;
+  int max_keys_;
+  std::string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+  std::string path_;
+
+  static const std::string ARG_COUNT_ONLY;
+  static const std::string ARG_COUNT_DELIM;
+  static const std::string ARG_STATS;
+  static const std::string ARG_TTL_BUCKET;
+};
+
+// LDBCommand subclass whose Name() is "idump".
+// NOTE(review): field semantics are inferred from names only; confirm against
+// the definition in ldb_cmd.cc.
+class InternalDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "idump"; }
+
+  InternalDumpCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool has_from_;
+  std::string from_;
+  bool has_to_;
+  std::string to_;
+  int max_keys_;
+  std::string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+  bool is_input_key_hex_;
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_DELIM;
+  static const std::string ARG_COUNT_ONLY;
+  static const std::string ARG_COUNT_DELIM;
+  static const std::string ARG_STATS;
+  static const std::string ARG_INPUT_KEY_HEX;
+};
+
+// LDBCommand subclass whose Name() is "load". It customizes the options used
+// to open the DB via PrepareOptionsForOpenDB().
+class DBLoaderCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "load"; }
+
+  DBLoaderCommand(std::string& db_name, std::vector<std::string>& args);
+
+  DBLoaderCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  Options PrepareOptionsForOpenDB() override;
+
+ private:
+  bool disable_wal_;
+  bool bulk_load_;
+  bool compact_;
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_DISABLE_WAL;
+  static const std::string ARG_BULK_LOAD;
+  static const std::string ARG_COMPACT;
+};
+
+// LDBCommand subclass whose Name() is "manifest_dump".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class ManifestDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "manifest_dump"; }
+
+  ManifestDumpCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  bool verbose_;
+  bool json_;
+  std::string path_;
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_VERBOSE;
+  static const std::string ARG_JSON;
+  static const std::string ARG_PATH;
+};
+
+// LDBCommand subclass whose Name() is "file_checksum_dump".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class FileChecksumDumpCommand : public LDBCommand {
+ public:
+ static std::string Name() { return "file_checksum_dump"; }
+
+ FileChecksumDumpCommand(const std::vector<std::string>& params,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ static void Help(std::string& ret);
+ void DoCommand() override;
+
+ bool NoDBOpen() override { return true; }
+
+ private:
+ std::string path_;
+
+ // Name of the command-line argument specific to this command.
+ static const std::string ARG_PATH;
+};
+
+// LDBCommand subclass whose Name() is "list_column_families".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class ListColumnFamiliesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "list_column_families"; }
+
+  ListColumnFamiliesCommand(const std::vector<std::string>& params,
+                            const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+};
+
+// LDBCommand subclass whose Name() is "create_column_family".
+// NoDBOpen() is overridden to return false explicitly.
+class CreateColumnFamilyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "create_column_family"; }
+
+  CreateColumnFamilyCommand(const std::vector<std::string>& params,
+                            const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+ private:
+  std::string new_cf_name_;
+};
+
+// LDBCommand subclass whose Name() is "drop_column_family".
+// NoDBOpen() is overridden to return false explicitly.
+class DropColumnFamilyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "drop_column_family"; }
+
+  DropColumnFamilyCommand(const std::vector<std::string>& params,
+                          const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+ private:
+  std::string cf_name_to_drop_;
+};
+
+// LDBCommand subclass whose Name() is "reduce_levels".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class ReduceDBLevelsCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "reduce_levels"; }
+
+  ReduceDBLevelsCommand(const std::vector<std::string>& params,
+                        const std::map<std::string, std::string>& options,
+                        const std::vector<std::string>& flags);
+
+  Options PrepareOptionsForOpenDB() override;
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& msg);
+
+  // Builds an argument vector from a DB path, a level count and a flag.
+  // NOTE(review): presumably the result is suitable for constructing this
+  // command programmatically -- confirm against the definition.
+  static std::vector<std::string> PrepareArgs(const std::string& db_path,
+                                              int new_levels,
+                                              bool print_old_level = false);
+
+ private:
+  int old_levels_;
+  int new_levels_;
+  bool print_old_levels_;
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_NEW_LEVELS;
+  static const std::string ARG_PRINT_OLD_LEVELS;
+
+  Status GetOldNumOfLevels(Options& opt, int* levels);
+};
+
+// LDBCommand subclass whose Name() is "change_compaction_style". It
+// customizes the options used to open the DB via PrepareOptionsForOpenDB().
+class ChangeCompactionStyleCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "change_compaction_style"; }
+
+  ChangeCompactionStyleCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  Options PrepareOptionsForOpenDB() override;
+
+  void DoCommand() override;
+
+  static void Help(std::string& msg);
+
+ private:
+  int old_compaction_style_;
+  int new_compaction_style_;
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_OLD_COMPACTION_STYLE;
+  static const std::string ARG_NEW_COMPACTION_STYLE;
+};
+
+// LDBCommand subclass whose Name() is "dump_wal".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class WALDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump_wal"; }
+
+  WALDumperCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+ private:
+  bool print_header_;
+  std::string wal_file_;
+  bool print_values_;
+  bool is_write_committed_;  // default will be set to true
+
+  // Names of the command-line arguments specific to this command.
+  static const std::string ARG_WAL_FILE;
+  static const std::string ARG_WRITE_COMMITTED;
+  static const std::string ARG_PRINT_HEADER;
+  static const std::string ARG_PRINT_VALUE;
+};
+
+// LDBCommand subclass whose Name() is "get"; stores a single key.
+class GetCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "get"; }
+
+  GetCommand(const std::vector<std::string>& params,
+             const std::map<std::string, std::string>& options,
+             const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string key_;
+};
+
+// LDBCommand subclass whose Name() is "approxsize"; stores a start key and an
+// end key.
+class ApproxSizeCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "approxsize"; }
+
+  ApproxSizeCommand(const std::vector<std::string>& params,
+                    const std::map<std::string, std::string>& options,
+                    const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string start_key_;
+  std::string end_key_;
+};
+
+// LDBCommand subclass whose Name() is "batchput". It customizes the options
+// used to open the DB via PrepareOptionsForOpenDB().
+class BatchPutCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "batchput"; }
+
+  BatchPutCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  Options PrepareOptionsForOpenDB() override;
+
+ private:
+  /**
+   * The key-values to be inserted, as (key, value) pairs.
+   */
+  std::vector<std::pair<std::string, std::string>> key_values_;
+};
+
+// LDBCommand subclass whose Name() is "scan".
+// NOTE(review): field semantics are inferred from names only; confirm against
+// the definition in ldb_cmd.cc.
+class ScanCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "scan"; }
+
+  ScanCommand(const std::vector<std::string>& params,
+              const std::map<std::string, std::string>& options,
+              const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string start_key_;
+  std::string end_key_;
+  bool start_key_specified_;
+  bool end_key_specified_;
+  int max_keys_scanned_;
+  bool no_value_;
+};
+
+// LDBCommand subclass whose Name() is "delete"; stores a single key.
+class DeleteCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "delete"; }
+
+  DeleteCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string key_;
+};
+
+// LDBCommand subclass whose Name() is "deleterange"; stores a begin key and an
+// end key.
+class DeleteRangeCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "deleterange"; }
+
+  DeleteRangeCommand(const std::vector<std::string>& params,
+                     const std::map<std::string, std::string>& options,
+                     const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string begin_key_;
+  std::string end_key_;
+};
+
+// LDBCommand subclass whose Name() is "put"; stores one key and one value. It
+// customizes the options used to open the DB via PrepareOptionsForOpenDB().
+class PutCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "put"; }
+
+  PutCommand(const std::vector<std::string>& params,
+             const std::map<std::string, std::string>& options,
+             const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  Options PrepareOptionsForOpenDB() override;
+
+ private:
+  std::string key_;
+  std::string value_;
+};
+
+/**
+ * Command (Name() is "query") that starts up a REPL shell that allows
+ * get/put/delete.
+ */
+class DBQuerierCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "query"; }
+
+  DBQuerierCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  // NOTE(review): presumably the keywords recognized inside the shell;
+  // their values are defined outside this header.
+  static const char* HELP_CMD;
+  static const char* GET_CMD;
+  static const char* PUT_CMD;
+  static const char* DELETE_CMD;
+};
+
+// LDBCommand subclass whose Name() is "checkconsistency".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class CheckConsistencyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "checkconsistency"; }
+
+  CheckConsistencyCommand(const std::vector<std::string>& params,
+                          const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+};
+
+// LDBCommand subclass whose Name() is "checkpoint".
+class CheckPointCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "checkpoint"; }
+
+  CheckPointCommand(const std::vector<std::string>& params,
+                    const std::map<std::string, std::string>& options,
+                    const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  // NOTE(review): this data member is public, unlike the fields of the other
+  // command classes in this header -- confirm whether that is intentional
+  // before relying on or changing it.
+  std::string checkpoint_dir_;
+
+ private:
+  static const std::string ARG_CHECKPOINT_DIR;
+};
+
+// LDBCommand subclass whose Name() is "repair".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class RepairCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "repair"; }
+
+  RepairCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+};
+
+// Common base of BackupCommand and RestoreCommand: holds the backup settings
+// both of them read (env URI, backup directory, thread count, logger).
+// NOTE(review): std::unique_ptr / std::shared_ptr are used below but this
+// header does not include <memory> itself; it presumably arrives through
+// "rocksdb/utilities/ldb_cmd.h" -- confirm.
+class BackupableCommand : public LDBCommand {
+ public:
+ BackupableCommand(const std::vector<std::string>& params,
+ const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ protected:
+ // Shared help routine; derived classes pass their own command name.
+ static void Help(const std::string& name, std::string& ret);
+ std::string backup_env_uri_;
+ std::string backup_dir_;
+ int num_threads_;
+ std::unique_ptr<Logger> logger_;
+ // Passed to Env::LoadEnv as its guard argument by the restore command.
+ std::shared_ptr<Env> backup_env_guard_;
+
+ private:
+ static const std::string ARG_BACKUP_DIR;
+ static const std::string ARG_BACKUP_ENV_URI;
+ static const std::string ARG_NUM_THREADS;
+ static const std::string ARG_STDERR_LOG_LEVEL;
+};
+
+// BackupableCommand subclass whose Name() is "backup".
+class BackupCommand : public BackupableCommand {
+ public:
+  static std::string Name() { return "backup"; }
+
+  BackupCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+};
+
+// BackupableCommand subclass whose Name() is "restore".
+// NoDBOpen() returns true -- presumably this tells the ldb driver not to open
+// the DB before DoCommand(); confirm in LDBCommand.
+class RestoreCommand : public BackupableCommand {
+ public:
+  static std::string Name() { return "restore"; }
+
+  RestoreCommand(const std::vector<std::string>& params,
+                 const std::map<std::string, std::string>& options,
+                 const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+};
+
+// LDBCommand subclass whose Name() is "write_extern_sst"; stores the path of
+// the SST file to write. NoDBOpen() is overridden to return false explicitly.
+class WriteExternalSstFilesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "write_extern_sst"; }
+
+  WriteExternalSstFilesCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+  Options PrepareOptionsForOpenDB() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string output_sst_path_;
+};
+
+// LDBCommand subclass whose Name() is "ingest_extern_sst"; stores the path of
+// the SST file to ingest plus one boolean per ingestion setting.
+// NoDBOpen() is overridden to return false explicitly.
+class IngestExternalSstFilesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "ingest_extern_sst"; }
+
+  IngestExternalSstFilesCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+  Options PrepareOptionsForOpenDB() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string input_sst_path_;
+  bool move_files_;
+  bool snapshot_consistency_;
+  bool allow_global_seqno_;
+  bool allow_blocking_flush_;
+  bool ingest_behind_;
+  bool write_global_seqno_;
+
+  // Command-line argument names, one per boolean setting above.
+  static const std::string ARG_MOVE_FILES;
+  static const std::string ARG_SNAPSHOT_CONSISTENCY;
+  static const std::string ARG_ALLOW_GLOBAL_SEQNO;
+  static const std::string ARG_ALLOW_BLOCKING_FLUSH;
+  static const std::string ARG_INGEST_BEHIND;
+  static const std::string ARG_WRITE_GLOBAL_SEQNO;
+};
+
+// Command that prints out range delete tombstones in SST files.
+// Unlike the other commands in this header, its constructor takes no
+// positional params argument.
+class ListFileRangeDeletesCommand : public LDBCommand {
+ public:
+ static std::string Name() { return "list_file_range_deletes"; }
+
+ ListFileRangeDeletesCommand(const std::map<std::string, std::string>& options,
+ const std::vector<std::string>& flags);
+
+ void DoCommand() override;
+
+ static void Help(std::string& ret);
+
+ private:
+ // Default used when no max-keys option is supplied on the command line.
+ int max_keys_ = 1000;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/ldb_cmd_test.cc b/src/rocksdb/tools/ldb_cmd_test.cc
new file mode 100644
index 000000000..80e2f0d1f
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_test.cc
@@ -0,0 +1,585 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_checksum.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/file_checksum_helper.h"
+
+using std::string;
+using std::vector;
+using std::map;
+
+namespace ROCKSDB_NAMESPACE {
+
+// Fixture shared by all ldb command tests; its only service is choosing the
+// Env the tests run against.
+class LdbCmdTest : public testing::Test {
+ public:
+  LdbCmdTest() : testing::Test() {}
+
+  // Returns Env::Default() unless the TEST_ENV_URI environment variable is
+  // set, in which case Env::LoadEnv() is asked to load that Env. The status
+  // returned by LoadEnv() is not inspected; `env` starts out as the default
+  // Env, and `env_guard_` is handed to LoadEnv() as the owner of whatever it
+  // creates.
+  Env* TryLoadCustomOrDefaultEnv() {
+    Env* env = Env::Default();
+    const char* test_env_uri = getenv("TEST_ENV_URI");
+    if (test_env_uri != nullptr) {
+      Env::LoadEnv(test_env_uri, &env, &env_guard_);
+    }
+    return env;
+  }
+
+ private:
+  std::shared_ptr<Env> env_guard_;
+};
+
+// Checks that HexToString() decodes "0x..." literals to the expected bytes
+// and that StringToHex() maps the result back to the original literal
+// (compared case-insensitively).
+TEST_F(LdbCmdTest, HexToString) {
+  // Map each input to its expected decoded bytes. Bytes are compared as
+  // signed chars, so 0xFF is expected as -1.
+  // An odd number of "hex" half bytes doesn't make sense.
+  map<string, vector<int>> inputMap = {
+      {"0x07", {7}}, {"0x5050", {80, 80}}, {"0xFF", {-1}},
+      {"0x1234", {18, 52}}, {"0xaaAbAC", {-86, -85, -84}}, {"0x1203", {18, 3}},
+  };
+
+  for (const auto& inPair : inputMap) {
+    auto actual = ROCKSDB_NAMESPACE::LDBCommand::HexToString(inPair.first);
+    const auto& expected = inPair.second;
+    // Without this check a too-short result would pass silently and a
+    // too-long one would index past the end of `expected`.
+    ASSERT_EQ(expected.size(), actual.length());
+    for (size_t i = 0; i < actual.length(); i++) {
+      EXPECT_EQ(expected[i],
+                static_cast<int>(static_cast<signed char>(actual[i])));
+    }
+    auto reverse = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(actual);
+    EXPECT_STRCASEEQ(inPair.first.c_str(), reverse.c_str());
+  }
+}
+
+// Checks that HexToString() throws for every malformed hex literal.
+TEST_F(LdbCmdTest, HexToStringBadInputs) {
+  const vector<string> badInputs = {
+      "0xZZ", "123", "0xx5", "0x111G", "0x123", "Ox12", "0xT", "0x1Q1",
+  };
+  // Iterate by reference: the original `const auto` copied every string.
+  for (const auto& badInput : badInputs) {
+    try {
+      ROCKSDB_NAMESPACE::LDBCommand::HexToString(badInput);
+      // Reaching this line means no exception was thrown. Attach the
+      // offending input to the gtest failure itself instead of printing it
+      // separately to stderr.
+      FAIL() << "Should fail on bad hex value: " << badInput;
+    } catch (...) {
+      // Expected: any exception type counts as a rejection of the input.
+    }
+  }
+}
+
+// Runs `ldb dump_live_files` against a DB created with NewMemEnv(), passing
+// the same Options (and therefore the same Env) to the command runner.
+TEST_F(LdbCmdTest, MemEnv) {
+  std::unique_ptr<Env> mem_env(NewMemEnv(TryLoadCustomOrDefaultEnv()));
+  Options opts;
+  opts.env = mem_env.get();
+  opts.create_if_missing = true;
+
+  opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+
+  std::string dbname = test::TmpDir();
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Populate 100 entries whose key and value are the same zero-padded
+  // number, then flush them.
+  WriteOptions wopts;
+  for (int key = 0; key < 100; ++key) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", key);
+    ASSERT_OK(db->Put(wopts, buf, buf));
+  }
+  FlushOptions fopts;
+  fopts.wait = true;
+  ASSERT_OK(db->Flush(fopts));
+
+  // Close the DB before handing its path to the ldb command.
+  delete db;
+
+  char ldb_bin[] = "./ldb";
+  char db_arg[1024];
+  snprintf(db_arg, sizeof(db_arg), "--db=%s", dbname.c_str());
+  char subcommand[] = "dump_live_files";
+  char* argv[] = {ldb_bin, db_arg, subcommand};
+
+  const int exit_code =
+      LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr);
+  ASSERT_EQ(0, exit_code);
+}
+
+// Test helper that cross-checks the SST file checksums reported by a DB
+// against (a) checksums recomputed from the file contents and (b) the
+// checksum records recovered from the MANIFEST.
+class FileChecksumTestHelper {
+ private:
+  Options options_;
+  DB* db_;
+  std::string dbname_;
+
+  // Recomputes the checksum of the file described by `file_meta` with
+  // options_.sst_file_checksum_func and compares both the value and the
+  // function name with what `file_meta` carries. When no checksum function
+  // is configured, the "unknown" placeholders are expected instead.
+  // Returns Corruption on mismatch, or the first file I/O error.
+  Status VerifyChecksum(const LiveFileMetaData& file_meta) {
+    std::string cur_checksum;
+    std::string checksum_func_name;
+
+    Status s;
+    EnvOptions soptions;
+    std::unique_ptr<SequentialFile> file_reader;
+    std::string file_path = dbname_ + "/" + file_meta.name;
+    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    const size_t kReadSize = 2048;
+    std::unique_ptr<char[]> scratch(new char[kReadSize]);
+    Slice result;
+    FileChecksumFunc* file_checksum_func =
+        options_.sst_file_checksum_func.get();
+    if (file_checksum_func == nullptr) {
+      cur_checksum = kUnknownFileChecksum;
+      checksum_func_name = kUnknownFileChecksumFuncName;
+    } else {
+      checksum_func_name = file_checksum_func->Name();
+      bool first_read = true;
+      while (true) {
+        s = file_reader->Read(kReadSize, &result, scratch.get());
+        if (!s.ok()) {
+          return s;
+        }
+        if (result.size() == 0) {
+          break;  // end of file
+        }
+        // Use the returned Slice, not `scratch`: Read() may point `result`
+        // at the scratch buffer but is not required to.
+        if (first_read) {
+          first_read = false;
+          cur_checksum =
+              file_checksum_func->Value(result.data(), result.size());
+        } else {
+          cur_checksum = file_checksum_func->Extend(
+              cur_checksum, result.data(), result.size());
+        }
+      }
+    }
+
+    const std::string& stored_checksum = file_meta.file_checksum;
+    const std::string& stored_checksum_func_name =
+        file_meta.file_checksum_func_name;
+    if ((cur_checksum != stored_checksum) ||
+        (checksum_func_name != stored_checksum_func_name)) {
+      return Status::Corruption(
+          "Checksum does not match! The file: " + file_meta.name +
+          ", checksum name: " + stored_checksum_func_name + " and checksum " +
+          stored_checksum + ". However, expected checksum name: " +
+          checksum_func_name + " and checksum " + cur_checksum);
+    }
+    return Status::OK();
+  }
+
+ public:
+  // `options` is copied; `db` is only used by VerifyEachFileChecksum().
+  FileChecksumTestHelper(Options& options, DB* db, std::string db_name)
+      : options_(options), db_(db), dbname_(db_name) {}
+  ~FileChecksumTestHelper() {}
+
+  // Verify the checksum information in Manifest.
+  // Recovers a VersionSet from the MANIFEST under dbname_ and checks that it
+  // records, for every entry of `live_files`, the same checksum and checksum
+  // function name. Does not touch db_, so it may be called after the DB has
+  // been closed. Note that it modifies the stored options (db_paths and
+  // num_levels) and normalizes dbname_ to end with '/'.
+  Status VerifyChecksumInManifest(
+      const std::vector<LiveFileMetaData>& live_files) {
+    // Step 1: verify if the dbname_ is correct
+    if (dbname_.empty()) {
+      // Indexing the last character of an empty name would be undefined.
+      return Status::InvalidArgument("DB name is empty");
+    }
+    if (dbname_.back() != '/') {
+      dbname_.append("/");
+    }
+
+    // Step 2, get the checksum information by recovering the VersionSet
+    // from Manifest.
+    std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
+    EnvOptions sopt;
+    std::shared_ptr<Cache> tc(NewLRUCache(options_.max_open_files - 10,
+                                          options_.table_cache_numshardbits));
+    options_.db_paths.emplace_back(dbname_, 0);
+    options_.num_levels = 64;
+    WriteController wc(options_.delayed_write_rate);
+    WriteBufferManager wb(options_.db_write_buffer_size);
+    ImmutableDBOptions immutable_db_options(options_);
+    VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
+                        &wc, nullptr);
+    std::vector<std::string> cf_name_list;
+    Status s;
+    s = versions.ListColumnFamilies(&cf_name_list, dbname_,
+                                    options_.file_system.get());
+    if (s.ok()) {
+      std::vector<ColumnFamilyDescriptor> cf_list;
+      for (const auto& name : cf_name_list) {
+        // One column family per line; the original omitted the newline.
+        fprintf(stdout, "cf_name: %s\n", name.c_str());
+        cf_list.emplace_back(name, ColumnFamilyOptions(options_));
+      }
+      s = versions.Recover(cf_list, true);
+    }
+    if (s.ok()) {
+      s = versions.GetLiveFilesChecksumInfo(checksum_list.get());
+    }
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Step 3 verify the checksum
+    if (live_files.size() != checksum_list->size()) {
+      return Status::Corruption("The number of files does not match!");
+    }
+    for (size_t i = 0; i < live_files.size(); i++) {
+      std::string stored_checksum = "";
+      std::string stored_func_name = "";
+      s = checksum_list->SearchOneFileChecksum(
+          live_files[i].file_number, &stored_checksum, &stored_func_name);
+      // Propagate every lookup failure, not only NotFound.
+      if (!s.ok()) {
+        return s;
+      }
+      if (live_files[i].file_checksum != stored_checksum ||
+          live_files[i].file_checksum_func_name != stored_func_name) {
+        return Status::Corruption(
+            "Checksum does not match! The file: " +
+            ToString(live_files[i].file_number) +
+            ". In Manifest, checksum name: " + stored_func_name +
+            " and checksum " + stored_checksum +
+            ". However, expected checksum name: " +
+            live_files[i].file_checksum_func_name + " and checksum " +
+            live_files[i].file_checksum);
+      }
+    }
+    return Status::OK();
+  }
+
+  // Verify the checksum of each file by recalculating the checksum and
+  // comparing it with the one being generated when a SST file is created.
+  // Requires db_ to be a live (not yet deleted) DB.
+  Status VerifyEachFileChecksum() {
+    assert(db_ != nullptr);
+    std::vector<LiveFileMetaData> live_files;
+    db_->GetLiveFilesMetaData(&live_files);
+    // Iterate by reference to avoid copying each metadata record.
+    for (const auto& a_file : live_files) {
+      Status cs = VerifyChecksum(a_file);
+      if (!cs.ok()) {
+        return cs;
+      }
+    }
+    return Status::OK();
+  }
+};
+
+// Exercises `ldb file_checksum_dump` on a DB opened without a checksum
+// function, and checks the per-file checksum metadata before and after a
+// manual compaction as well as against the MANIFEST.
+TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+  opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+
+  DB* db = nullptr;
+  std::string dbname = test::TmpDir();
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  // Each [first, last) key range is written with random 100-byte values and
+  // then flushed on its own; the ranges overlap on purpose.
+  const int kKeyRanges[][2] = {{0, 200}, {100, 300}, {200, 400}, {300, 400}};
+  for (const auto& key_range : kKeyRanges) {
+    for (int key = key_range[0]; key < key_range[1]; key++) {
+      char buf[16];
+      snprintf(buf, sizeof(buf), "%08d", key);
+      std::string value;
+      test::RandomString(&rnd, 100, &value);
+      ASSERT_OK(db->Put(wopts, buf, value));
+    }
+    ASSERT_OK(db->Flush(fopts));
+  }
+
+  char ldb_bin[] = "./ldb";
+  char db_arg[1024];
+  snprintf(db_arg, sizeof(db_arg), "--db=%s", dbname.c_str());
+  char subcommand[] = "file_checksum_dump";
+  char* argv[] = {ldb_bin, db_arg, subcommand};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+  // Verify each sst file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction over the whole written key space
+  char first_key[16];
+  snprintf(first_key, sizeof(first_key), "%08d", 0);
+  char last_key[16];
+  snprintf(last_key, sizeof(last_key), "%08d", 399);
+  Slice begin(first_key);
+  Slice end(last_key);
+  CompactRangeOptions compact_opts;
+  ASSERT_OK(db->CompactRange(compact_opts, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+  // Verify the checksum information in memory is the same as that in
+  // Manifest. The metadata is captured before the DB is closed.
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);
+  delete db;
+  ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
+}
+
+// Same scenario as the no-checksum variant, but with the CRC32c file
+// checksum function installed in the options.
+TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+  opts.sst_file_checksum_func =
+      std::shared_ptr<FileChecksumFunc>(CreateFileChecksumFuncCrc32c());
+  opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+
+  DB* db = nullptr;
+  std::string dbname = test::TmpDir();
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  // Each [first, last) key range is written with random 100-byte values and
+  // then flushed on its own; the ranges overlap on purpose.
+  const int kKeyRanges[][2] = {{0, 100}, {50, 150}, {100, 200}, {150, 250}};
+  for (const auto& key_range : kKeyRanges) {
+    for (int key = key_range[0]; key < key_range[1]; key++) {
+      char buf[16];
+      snprintf(buf, sizeof(buf), "%08d", key);
+      std::string value;
+      test::RandomString(&rnd, 100, &value);
+      ASSERT_OK(db->Put(wopts, buf, value));
+    }
+    ASSERT_OK(db->Flush(fopts));
+  }
+
+  char ldb_bin[] = "./ldb";
+  char db_arg[1024];
+  snprintf(db_arg, sizeof(db_arg), "--db=%s", dbname.c_str());
+  char subcommand[] = "file_checksum_dump";
+  char* argv[] = {ldb_bin, db_arg, subcommand};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+  // Verify each sst file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction over the whole written key space
+  char first_key[16];
+  snprintf(first_key, sizeof(first_key), "%08d", 0);
+  char last_key[16];
+  snprintf(last_key, sizeof(last_key), "%08d", 249);
+  Slice begin(first_key);
+  Slice end(last_key);
+  CompactRangeOptions compact_opts;
+  ASSERT_OK(db->CompactRange(compact_opts, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+  // Verify the checksum information in memory is the same as that in
+  // Manifest. The metadata is captured before the DB is closed.
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);
+  delete db;
+  ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
+}
+
+// Checks how InitFromCmdLineArgs() splits a command line into valueless
+// flags and key=value options.
+TEST_F(LdbCmdTest, OptionParsing) {
+  Options opts;
+  opts.env = TryLoadCustomOrDefaultEnv();
+  // test parsing flags
+  {
+    const std::vector<std::string> args = {"scan", "--ttl", "--timestamp"};
+    std::unique_ptr<LDBCommand> command(
+        ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs(
+            args, opts, LDBOptions(), nullptr));
+    const std::vector<std::string> flags = command->TEST_GetFlags();
+    EXPECT_EQ(flags.size(), 2);
+    EXPECT_EQ(flags[0], "ttl");
+    EXPECT_EQ(flags[1], "timestamp");
+  }
+  // test parsing options which contains equal sign in the option value
+  {
+    const std::vector<std::string> args = {
+        "scan",
+        "--db=/dev/shm/ldbtest/",
+        "--from='abcd/efg/hijk/lmn/"
+        "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz=a&bcd_"
+        "ef=gh.ijk'",
+    };
+    std::unique_ptr<LDBCommand> command(
+        ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs(
+            args, opts, LDBOptions(), nullptr));
+    const std::map<std::string, std::string> option_map =
+        command->TEST_GetOptionMap();
+    EXPECT_EQ(option_map.at("db"), "/dev/shm/ldbtest/");
+    // Only the first '=' separates key from value; the rest stay in the
+    // value, quotes included.
+    EXPECT_EQ(option_map.at("from"),
+              "'abcd/efg/hijk/lmn/"
+              "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz="
+              "a&bcd_ef=gh.ijk'");
+  }
+}
+
+// Writes two range deletions, then checks through a sync-point callback how
+// many tombstones `ldb list_file_range_deletes` is about to print, with and
+// without --max_keys=1.
+TEST_F(LdbCmdTest, ListFileTombstone) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::TmpDir();
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  ASSERT_OK(db->Put(wopts, "foo", "1"));
+  ASSERT_OK(db->Put(wopts, "bar", "2"));
+
+  FlushOptions fopts;
+  fopts.wait = true;
+  ASSERT_OK(db->Flush(fopts));
+
+  ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "foo", "foo2"));
+  ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2"));
+  ASSERT_OK(db->Flush(fopts));
+
+  delete db;
+
+  // Count number of tombstones printed: one per occurrence of the
+  // fingerprint string in the command's pending output.
+  auto count_tombstones = [](const std::string& output) {
+    const std::string kFingerprintStr = "start: ";
+    int num_tb = 0;
+    for (auto offset = output.find(kFingerprintStr);
+         offset != std::string::npos;
+         offset = output.find(kFingerprintStr,
+                              offset + kFingerprintStr.size())) {
+      num_tb++;
+    }
+    return num_tb;
+  };
+
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "list_file_range_deletes";
+    char* argv[] = {arg1, arg2, arg3};
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) {
+          const std::string* out_str = reinterpret_cast<std::string*>(arg);
+          EXPECT_EQ(2, count_tombstones(*out_str));
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  // Test the case of limiting tombstones
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "list_file_range_deletes";
+    char arg4[] = "--max_keys=1";
+    char* argv[] = {arg1, arg2, arg3, arg4};
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) {
+          const std::string* out_str = reinterpret_cast<std::string*>(arg);
+          EXPECT_EQ(1, count_tombstones(*out_str));
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+// RegisterCustomObjects() is called by main() before the tests run. When the
+// macro below is defined, only a C-linkage declaration is emitted here and
+// the definition must come from elsewhere at link time; otherwise an empty
+// no-op definition is provided.
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+// Test entry point for non-LITE builds: installs the stack trace handler,
+// lets GoogleTest consume its own command-line flags, gives
+// RegisterCustomObjects() the remaining arguments, then runs every test and
+// returns GoogleTest's result as the exit code.
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+#else
+#include <stdio.h>
+
+// Entry point used when ROCKSDB_LITE is defined: runs no tests, reports the
+// skip on stderr and exits with 0.
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_test.py b/src/rocksdb/tools/ldb_test.py
new file mode 100644
index 000000000..74bb7fb16
--- /dev/null
+++ b/src/rocksdb/tools/ldb_test.py
@@ -0,0 +1,595 @@
+#!/usr/bin/env python2
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import os
+import glob
+import os.path
+import shutil
+import subprocess
+import time
+import unittest
+import tempfile
+import re
+
+def my_check_output(*popenargs, **kwargs):
+ """
+ If we had python 2.7, we should simply use subprocess.check_output.
+ This is a stop-gap solution for python 2.6
+ """
+ if 'stdout' in kwargs:
+ raise ValueError('stdout argument not allowed, it will be overridden.')
+ process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE,
+ *popenargs, **kwargs)
+ output, unused_err = process.communicate()
+ retcode = process.poll()
+ if retcode:
+ cmd = kwargs.get("args")
+ if cmd is None:
+ cmd = popenargs[0]
+ raise Exception("Exit code is not 0. It is %d. Command: %s" %
+ (retcode, cmd))
+ return output
+
+def run_err_null(cmd):
+ return os.system(cmd + " 2>/dev/null ")
+
+class LDBTestCase(unittest.TestCase):
+ def setUp(self):
+ self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_")
+ self.DB_NAME = "testdb"
+
+ def tearDown(self):
+ assert(self.TMP_DIR.strip() != "/"
+ and self.TMP_DIR.strip() != "/tmp"
+ and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia
+
+ shutil.rmtree(self.TMP_DIR)
+
+ def dbParam(self, dbName):
+ return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
+
+ def assertRunOKFull(self, params, expectedOutput, unexpected=False,
+ isPattern=False):
+ """
+ All command-line params must be specified.
+ Allows full flexibility in testing; for example: missing db param.
+
+ """
+ output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" %
+ params, shell=True)
+ if not unexpected:
+ if isPattern:
+ self.assertNotEqual(expectedOutput.search(output.strip()),
+ None)
+ else:
+ self.assertEqual(output.strip(), expectedOutput.strip())
+ else:
+ if isPattern:
+ self.assertEqual(expectedOutput.search(output.strip()), None)
+ else:
+ self.assertNotEqual(output.strip(), expectedOutput.strip())
+
+ def assertRunFAILFull(self, params):
+ """
+ All command-line params must be specified.
+ Allows full flexibility in testing; for example: missing db param.
+
+ Passes when running the command makes my_check_output raise; calls
+ self.fail() otherwise.
+ """
+ # NOTE(review): the shell status seen by my_check_output is that of the
+ # last pipeline stage (grep), not of ./ldb. Because ldb's stdout is sent
+ # to /dev/null, grep reads no input and presumably always exits non-zero,
+ # which would make this assertion pass regardless of what ldb did --
+ # verify before relying on it.
+ # NOTE(review): the backslash-newline inside the string literal keeps the
+ # next line's leading whitespace in the grep pattern -- confirm intended.
+ try:
+
+ my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \
+ thread\"" % params, shell=True)
+ except Exception:
+ return
+ self.fail(
+ "Exception should have been raised for command with params: %s" %
+ params)
+
+ def assertRunOK(self, params, expectedOutput, unexpected=False):
+ """
+ Uses the default test db.
+
+ """
+ self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params),
+ expectedOutput, unexpected)
+
+ def assertRunFAIL(self, params):
+ """
+ Uses the default test db.
+ """
+ self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
+
+ def testSimpleStringPutGet(self):
+ # Walks through put/get/scan/delete on plain string keys. The steps are
+ # order-dependent: each one builds on the db state left by the previous.
+ print "Running testSimpleStringPutGet..."
+ # The first put has no --create_if_missing and the db does not exist yet.
+ self.assertRunFAIL("put x1 y1")
+ self.assertRunOK("put --create_if_missing x1 y1", "OK")
+ self.assertRunOK("get x1", "y1")
+ self.assertRunFAIL("get x2")
+
+ self.assertRunOK("put x2 y2", "OK")
+ self.assertRunOK("get x1", "y1")
+ self.assertRunOK("get x2", "y2")
+ self.assertRunFAIL("get x3")
+
+ self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
+ self.assertRunOK("put x3 y3", "OK")
+
+ self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+ self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
+
+ # The expected outputs below show --to behaving as an exclusive bound.
+ self.assertRunOK("scan --to=x2", "x1 : y1")
+ self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
+ self.assertRunOK("scan --from=x1 --to=z --max_keys=2",
+ "x1 : y1\nx2 : y2")
+
+ self.assertRunOK("scan --from=x1 --to=z --max_keys=3",
+ "x1 : y1\nx2 : y2\nx3 : y3")
+ self.assertRunOK("scan --from=x1 --to=z --max_keys=4",
+ "x1 : y1\nx2 : y2\nx3 : y3")
+ self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
+ self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
+ self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL
+ self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
+
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+
+ self.assertRunOK("delete x1", "OK")
+ self.assertRunOK("scan", "x2 : y2\nx3 : y3")
+
+ self.assertRunOK("delete NonExistentKey", "OK")
+ # It is weird that GET and SCAN raise exception for
+ # non-existent key, while delete does not
+
+ self.assertRunOK("checkconsistency", "OK")
+
+ def dumpDb(self, params, dumpFile):
+ return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
+
+ def loadDb(self, params, dumpFile):
+ return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
+
+ def writeExternSst(self, params, inputDumpFile, outputSst):
+ return 0 == run_err_null("cat %s | ./ldb write_extern_sst %s %s"
+ % (inputDumpFile, outputSst, params))
+
+ def ingestExternSst(self, params, inputSst):
+ return 0 == run_err_null("./ldb ingest_extern_sst %s %s"
+ % (inputSst, params))
+
+ def testStringBatchPut(self):
+ # batchput takes key/value pairs; quoted arguments may contain spaces.
+ print "Running testStringBatchPut..."
+ self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
+ self.assertRunOK("scan", "x1 : y1")
+ self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK")
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+ # No arguments, or an odd number of them, must be rejected.
+ self.assertRunFAIL("batchput")
+ self.assertRunFAIL("batchput k1")
+ self.assertRunFAIL("batchput k1 v1 k2")
+
+ def testCountDelimDump(self):
+ # dump --count_delim groups keys by the prefix before the delimiter and
+ # reports a count and size per group. The first two dumps expect the same
+ # output with and without an explicit "." delimiter.
+ print "Running testCountDelimDump..."
+ self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+ self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+ self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+ self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+ self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+ # With "," as delimiter, keys without a comma each form their own group.
+ self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+ def testCountDelimIDump(self):
+ # Same scenario and expected output as testCountDelimDump, but using the
+ # idump command.
+ print "Running testCountDelimIDump..."
+ self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+ self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+ self.assertRunOK("idump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+ self.assertRunOK("idump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+ self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+ self.assertRunOK("idump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+ def testInvalidCmdLines(self):
+ # NOTE(review): this class defines testInvalidCmdLines a second time
+ # further down; the later definition replaces this one, so this body
+ # never runs.
+ print "Running testInvalidCmdLines..."
+ # db not specified
+ self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+ # No param called he
+ self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+ # max_keys is not applicable for put
+ self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+ # hex has invalid boolean value (no assertion follows in this copy)
+
+ def testHexPutGet(self):
+ # Exercises --hex, --key_hex and --value_hex on put/get/scan/batchput/
+ # delete. 0x6131 is "a1", 0x6231 is "b1", and so on.
+ print "Running testHexPutGet..."
+ self.assertRunOK("put a1 b1 --create_if_missing", "OK")
+ self.assertRunOK("scan", "a1 : b1")
+ self.assertRunOK("scan --hex", "0x6131 : 0x6231")
+ # Hex arguments without the 0x prefix are rejected.
+ self.assertRunFAIL("put --hex 6132 6232")
+ self.assertRunOK("put --hex 0x6132 0x6232", "OK")
+ self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
+ self.assertRunOK("scan", "a1 : b1\na2 : b2")
+ self.assertRunOK("get a1", "b1")
+ self.assertRunOK("get --hex 0x6131", "0x6231")
+ self.assertRunOK("get a2", "b2")
+ self.assertRunOK("get --hex 0x6132", "0x6232")
+ # --key_hex and --value_hex apply hex to only one side.
+ self.assertRunOK("get --key_hex 0x6132", "b2")
+ self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
+ self.assertRunOK("get --value_hex a2", "0x6232")
+ self.assertRunOK("scan --key_hex --value_hex",
+ "0x6131 : 0x6231\n0x6132 : 0x6232")
+ self.assertRunOK("scan --hex --from=0x6131 --to=0x6133",
+ "0x6131 : 0x6231\n0x6132 : 0x6232")
+ self.assertRunOK("scan --hex --from=0x6131 --to=0x6132",
+ "0x6131 : 0x6231")
+ self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
+ self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
+ self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
+ self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
+ self.assertRunOK("delete --hex 0x6133", "OK")
+ self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
+ self.assertRunOK("checkconsistency", "OK")
+
+ def testTtlPutGet(self):
+ # Values written with --ttl are expected to read back unchanged only when
+ # --ttl is also passed on the read. The calls whose third argument is True
+ # assert that the output differs from the given string.
+ print "Running testTtlPutGet..."
+ self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
+ self.assertRunOK("scan --hex", "0x6131 : 0x6231", True)
+ self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
+ self.assertRunOK("dump --hex --ttl ",
+ "0x6131 ==> 0x6231\nKeys in range: 1")
+ self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
+ self.assertRunOK("get --value_hex a1", "0x6231", True)
+ self.assertRunOK("get --ttl a1", "b1")
+ self.assertRunOK("put a3 b3 --create_if_missing", "OK")
+ # fails because timestamp's length is greater than value's
+ self.assertRunFAIL("get --ttl a3")
+ self.assertRunOK("checkconsistency", "OK")
+
+ # This is the second definition of testInvalidCmdLines in this class (hence
+ # the F811 suppression); it replaces the earlier one and is the version that
+ # runs. Each command below is expected to be rejected.
+ def testInvalidCmdLines(self): # noqa: F811 T25377293 Grandfathered in
+ print "Running testInvalidCmdLines..."
+ # db not specified
+ self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+ # No param called he
+ self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+ # max_keys is not applicable for put
+ self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+ # hex has invalid boolean value
+ self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
+
+ def testDumpLoad(self):
+ # Dumps the default test db with various parameters, loads each dump into
+ # a fresh db under TMP_DIR, and checks the loaded contents with scan.
+ print "Running testDumpLoad..."
+ self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+ "OK")
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+ origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+ # Dump and load without any additional params specified
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
+ self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ # Dump and load in hex
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
+ self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ # Dump only a portion of the key range
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
+ self.assertTrue(self.dumpDb(
+ "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
+
+ # Dump up to max_keys rows
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
+ self.assertTrue(self.dumpDb(
+ "--db=%s --max_keys=3" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3")
+
+ # Load into an existing db, create_if_missing is not specified
+ # (reuses the dump4 file path and the db loaded from it above)
+ self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ # Dump and load with WAL disabled
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
+ self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --disable_wal --create_if_missing" % loadedDbPath,
+ dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ # Dump and load with lots of extra params specified
+ extraParams = " ".join(["--bloom_bits=14", "--block_size=1024",
+ "--auto_compaction=true",
+ "--write_buffer_size=4194304",
+ "--file_size=2097152"])
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
+ self.assertTrue(self.dumpDb(
+ "--db=%s %s" % (origDbPath, extraParams), dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
+ dumpFilePath))
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ # Dump with count_only
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
+ loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
+ self.assertTrue(self.dumpDb(
+ "--db=%s --count_only" % origDbPath, dumpFilePath))
+ self.assertTrue(self.loadDb(
+ "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+ # DB should have at least one value for scan to work
+ self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
+ self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
+
+ # Dump command fails because of typo in params
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
+ self.assertFalse(self.dumpDb(
+ "--db=%s --create_if_missing" % origDbPath, dumpFilePath))
+
+ def testIDumpBasics(self):
+ # idump prints internal keys with their sequence number and type.
+ print "Running testIDumpBasics..."
+ self.assertRunOK("put a val --create_if_missing", "OK")
+ self.assertRunOK("put b val", "OK")
+ self.assertRunOK(
+ "idump", "'a' seq:1, type:1 => val\n"
+ "'b' seq:2, type:1 => val\nInternal keys in range: 2")
+ # The range bounds are given in hex via hex(ord(...)); only 'a' is
+ # expected back, i.e. --to is an exclusive bound here.
+ self.assertRunOK(
+ "idump --input_key_hex --from=%s --to=%s" % (hex(ord('a')),
+ hex(ord('b'))),
+ "'a' seq:1, type:1 => val\nInternal keys in range: 1")
+
+ def testMiscAdminTask(self):
+ print "Running testMiscAdminTask..."
+ # These tests need to be improved; for example with asserts about
+ # whether compaction or level reduction actually took place.
+ self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+ "OK")
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+ origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+ self.assertTrue(0 == run_err_null(
+ "./ldb compact --db=%s" % origDbPath))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ self.assertTrue(0 == run_err_null(
+ "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ self.assertTrue(0 == run_err_null(
+ "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ self.assertTrue(0 == run_err_null(
+ "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ self.assertTrue(0 == run_err_null(
+ "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134"
+ % origDbPath))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted.
+ self.assertTrue(0 == run_err_null(
+ "./ldb dump_wal --db=%s --walfile=%s --header" % (
+ origDbPath, os.path.join(origDbPath, "LOG"))))
+ self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+ def testCheckConsistency(self):
+ print "Running testCheckConsistency..."
+
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+ self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+ self.assertRunOK("put x2 y2", "OK")
+ self.assertRunOK("get x1", "y1")
+ self.assertRunOK("checkconsistency", "OK")
+
+ sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"),
+ shell=True)
+
+ # Modify the file
+ my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
+ self.assertRunFAIL("checkconsistency")
+
+ # Delete the file
+ my_check_output("rm -f %s" % sstFilePath, shell=True)
+ self.assertRunFAIL("checkconsistency")
+
+ def dumpLiveFiles(self, params, dumpFile):
+ return 0 == run_err_null("./ldb dump_live_files %s > %s" % (
+ params, dumpFile))
+
+ def testDumpLiveFiles(self):
+ print "Running testDumpLiveFiles..."
+
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+ self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+ self.assertRunOK("put x2 y2", "OK")
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+ self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath))
+ self.assertRunOK("delete x1", "OK")
+ self.assertRunOK("put x3 y3", "OK")
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+ self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath))
+
+ def getManifests(self, directory):
+ return glob.glob(directory + "/MANIFEST-*")
+
+ def getSSTFiles(self, directory):
+ return glob.glob(directory + "/*.sst")
+
+ def getWALFiles(self, directory):
+ return glob.glob(directory + "/*.log")
+
+ def copyManifests(self, src, dest):
+ return 0 == run_err_null("cp " + src + " " + dest)
+
+ def testManifestDump(self):
+ print "Running testManifestDump..."
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+ self.assertRunOK("put 1 1 --create_if_missing", "OK")
+ self.assertRunOK("put 2 2", "OK")
+ self.assertRunOK("put 3 3", "OK")
+ # Pattern to expect from manifest_dump.
+ num = "[0-9]+"
+ st = ".*"
+ subpat = st + " seq:" + num + ", type:" + num
+ regex = num + ":" + num + "\[" + subpat + ".." + subpat + "\]"
+ expected_pattern = re.compile(regex)
+ cmd = "manifest_dump --db=%s"
+ manifest_files = self.getManifests(dbPath)
+ self.assertTrue(len(manifest_files) == 1)
+ # Test with the default manifest file in dbPath.
+ self.assertRunOKFull(cmd % dbPath, expected_pattern,
+ unexpected=False, isPattern=True)
+ self.copyManifests(manifest_files[0], manifest_files[0] + "1")
+ manifest_files = self.getManifests(dbPath)
+ self.assertTrue(len(manifest_files) == 2)
+ # Test with multiple manifest files in dbPath.
+ self.assertRunFAILFull(cmd % dbPath)
+ # Running it with the copy we just created should pass.
+ self.assertRunOKFull((cmd + " --path=%s")
+ % (dbPath, manifest_files[1]),
+ expected_pattern, unexpected=False,
+ isPattern=True)
+ # Make sure that using the dump with --path will result in identical
+ # output as just using manifest_dump.
+ cmd = "dump --path=%s"
+ self.assertRunOKFull((cmd)
+ % (manifest_files[1]),
+ expected_pattern, unexpected=False,
+ isPattern=True)
+
+ def testSSTDump(self):
+ print "Running testSSTDump..."
+
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+ self.assertRunOK("put sst1 sst1_val --create_if_missing", "OK")
+ self.assertRunOK("put sst2 sst2_val", "OK")
+ self.assertRunOK("get sst1", "sst1_val")
+
+ # Pattern to expect from SST dump.
+ regex = ".*Sst file format:.*"
+ expected_pattern = re.compile(regex)
+
+ sst_files = self.getSSTFiles(dbPath)
+ self.assertTrue(len(sst_files) >= 1)
+ cmd = "dump --path=%s"
+ self.assertRunOKFull((cmd)
+ % (sst_files[0]),
+ expected_pattern, unexpected=False,
+ isPattern=True)
+
+ def testWALDump(self):
+ print "Running testWALDump..."
+
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+ self.assertRunOK("put wal1 wal1_val --create_if_missing", "OK")
+ self.assertRunOK("put wal2 wal2_val", "OK")
+ self.assertRunOK("get wal1", "wal1_val")
+
+ # Pattern to expect from WAL dump.
+ regex = "^Sequence,Count,ByteSize,Physical Offset,Key\(s\).*"
+ expected_pattern = re.compile(regex)
+
+ wal_files = self.getWALFiles(dbPath)
+ self.assertTrue(len(wal_files) >= 1)
+ cmd = "dump --path=%s"
+ self.assertRunOKFull((cmd)
+ % (wal_files[0]),
+ expected_pattern, unexpected=False,
+ isPattern=True)
+
+ def testListColumnFamilies(self):
+ print "Running testListColumnFamilies..."
+ self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+ cmd = "list_column_families | grep -v \"Column families\""
+ # Test on valid dbPath.
+ self.assertRunOK(cmd, "{default}")
+ # Test on empty path.
+ self.assertRunFAIL(cmd)
+
+ def testColumnFamilies(self):
+ print "Running testColumnFamilies..."
+ dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) # noqa: F841 T25377293 Grandfathered in
+ self.assertRunOK("put cf1_1 1 --create_if_missing", "OK")
+ self.assertRunOK("put cf1_2 2 --create_if_missing", "OK")
+ self.assertRunOK("put cf1_3 3 --try_load_options", "OK")
+ # Given non-default column family to single CF DB.
+ self.assertRunFAIL("get cf1_1 --column_family=two")
+ self.assertRunOK("create_column_family two", "OK")
+ self.assertRunOK("put cf2_1 1 --create_if_missing --column_family=two",
+ "OK")
+ self.assertRunOK("put cf2_2 2 --create_if_missing --column_family=two",
+ "OK")
+ self.assertRunOK("delete cf1_2", "OK")
+ self.assertRunOK("create_column_family three", "OK")
+ self.assertRunOK("delete cf2_2 --column_family=two", "OK")
+ self.assertRunOK(
+ "put cf3_1 3 --create_if_missing --column_family=three",
+ "OK")
+ self.assertRunOK("get cf1_1 --column_family=default", "1")
+ self.assertRunOK("dump --column_family=two",
+ "cf2_1 ==> 1\nKeys in range: 1")
+ self.assertRunOK("dump --column_family=two --try_load_options",
+ "cf2_1 ==> 1\nKeys in range: 1")
+ self.assertRunOK("dump",
+ "cf1_1 ==> 1\ncf1_3 ==> 3\nKeys in range: 2")
+ self.assertRunOK("get cf2_1 --column_family=two",
+ "1")
+ self.assertRunOK("get cf3_1 --column_family=three",
+ "3")
+ self.assertRunOK("drop_column_family three", "OK")
+ # non-existing column family.
+ self.assertRunFAIL("get cf3_1 --column_family=four")
+ self.assertRunFAIL("drop_column_family four")
+
+ def testIngestExternalSst(self):
+ print "Running testIngestExternalSst..."
+
+ # Dump, load, write external sst and ingest it in another db
+ dbPath = os.path.join(self.TMP_DIR, "db1")
+ self.assertRunOK(
+ "batchput --db=%s --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4"
+ % dbPath,
+ "OK")
+ self.assertRunOK("scan --db=%s" % dbPath,
+ "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+ dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+ with open(dumpFilePath, 'w') as f:
+ f.write("x1 ==> y10\nx2 ==> y20\nx3 ==> y30\nx4 ==> y40")
+ externSstPath = os.path.join(self.TMP_DIR, "extern_data1.sst")
+ self.assertTrue(self.writeExternSst("--create_if_missing --db=%s"
+ % dbPath,
+ dumpFilePath,
+ externSstPath))
+ # cannot ingest if allow_global_seqno is false
+ self.assertFalse(
+ self.ingestExternSst(
+ "--create_if_missing --allow_global_seqno=false --db=%s"
+ % dbPath,
+ externSstPath))
+ self.assertTrue(
+ self.ingestExternSst(
+ "--create_if_missing --allow_global_seqno --db=%s"
+ % dbPath,
+ externSstPath))
+ self.assertRunOKFull("scan --db=%s" % dbPath,
+ "x1 : y10\nx2 : y20\nx3 : y30\nx4 : y40")
+
+# Run the unittest test cases in this module when executed as a script.
+if __name__ == "__main__":
+ unittest.main()
diff --git a/src/rocksdb/tools/ldb_tool.cc b/src/rocksdb/tools/ldb_tool.cc
new file mode 100644
index 000000000..8174b7e0c
--- /dev/null
+++ b/src/rocksdb/tools/ldb_tool.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "tools/ldb_cmd_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Default construction only; no members are assigned here.
+LDBOptions::LDBOptions() = default;
+
+// Builds the ldb usage text and writes it to stderr.
+//
+// The text consists of ldb_options.print_help_header, the common
+// command-line options, and then the Help() output of every data access and
+// admin command, in the order appended below. The executable-name parameter
+// is unused.
+void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
+ const char* /*exec_name*/) {
+ std::string ret;
+
+ ret.append(ldb_options.print_help_header);
+ ret.append("\n\n");
+ ret.append("commands MUST specify --" + LDBCommand::ARG_DB +
+ "=<full_path_to_db_directory> when necessary\n");
+ ret.append("\n");
+ ret.append("commands can optionally specify --" + LDBCommand::ARG_ENV_URI +
+ "=<uri_of_environment> if necessary\n\n");
+ // Key/value encoding options.
+ ret.append(
+ "The following optional parameters control if keys/values are "
+ "input/output as hex or as plain strings:\n");
+ ret.append(" --" + LDBCommand::ARG_KEY_HEX +
+ " : Keys are input/output as hex\n");
+ ret.append(" --" + LDBCommand::ARG_VALUE_HEX +
+ " : Values are input/output as hex\n");
+ ret.append(" --" + LDBCommand::ARG_HEX +
+ " : Both keys and values are input/output as hex\n");
+ ret.append("\n");
+
+ // Options that control database internals.
+ ret.append(
+ "The following optional parameters control the database "
+ "internals:\n");
+ ret.append(
+ " --" + LDBCommand::ARG_CF_NAME +
+ "=<string> : name of the column family to operate on. default: default "
+ "column family\n");
+ ret.append(" --" + LDBCommand::ARG_TTL +
+ " with 'put','get','scan','dump','query','batchput'"
+ " : DB supports ttl and value is internally timestamp-suffixed\n");
+ ret.append(" --" + LDBCommand::ARG_TRY_LOAD_OPTIONS +
+ " : Try to load option file from DB.\n");
+ ret.append(" --" + LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS +
+ " : Ignore unknown options when loading option file.\n");
+ ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n");
+ ret.append(" --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=<int,e.g.:14>\n");
+ ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE +
+ "=<no|snappy|zlib|bzip2|lz4|lz4hc|xpress|zstd>\n");
+ ret.append(" --" + LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES +
+ "=<int,e.g.:16384>\n");
+ ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + "=<block_size_in_bytes>\n");
+ ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n");
+ ret.append(" --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE +
+ "=<int,e.g.:16777216>\n");
+ ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE +
+ "=<int,e.g.:4194304>\n");
+ ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n");
+
+ // Each command appends its own usage text to ret.
+ ret.append("\n\n");
+ ret.append("Data Access Commands:\n");
+ PutCommand::Help(ret);
+ GetCommand::Help(ret);
+ BatchPutCommand::Help(ret);
+ ScanCommand::Help(ret);
+ DeleteCommand::Help(ret);
+ DeleteRangeCommand::Help(ret);
+ DBQuerierCommand::Help(ret);
+ ApproxSizeCommand::Help(ret);
+ CheckConsistencyCommand::Help(ret);
+ ListFileRangeDeletesCommand::Help(ret);
+
+ ret.append("\n\n");
+ ret.append("Admin Commands:\n");
+ WALDumperCommand::Help(ret);
+ CompactorCommand::Help(ret);
+ ReduceDBLevelsCommand::Help(ret);
+ ChangeCompactionStyleCommand::Help(ret);
+ DBDumperCommand::Help(ret);
+ DBLoaderCommand::Help(ret);
+ ManifestDumpCommand::Help(ret);
+ FileChecksumDumpCommand::Help(ret);
+ ListColumnFamiliesCommand::Help(ret);
+ CreateColumnFamilyCommand::Help(ret);
+ DropColumnFamilyCommand::Help(ret);
+ DBFileDumperCommand::Help(ret);
+ InternalDumpCommand::Help(ret);
+ RepairCommand::Help(ret);
+ BackupCommand::Help(ret);
+ RestoreCommand::Help(ret);
+ CheckPointCommand::Help(ret);
+ WriteExternalSstFilesCommand::Help(ret);
+ IngestExternalSstFilesCommand::Help(ret);
+
+ // Help goes to stderr, not stdout.
+ fprintf(stderr, "%s\n", ret.c_str());
+}
+
+// Parses the command line into an LDBCommand, validates it, runs it, and
+// prints the execution result to stderr.
+//
+// Returns 0 when the command ran and did not fail. Returns 1 when there are
+// too few arguments, the command is unknown, option validation fails, or the
+// command reports failure. Help is printed for the first two cases.
+int LDBCommandRunner::RunCommand(
+    int argc, char** argv, Options options, const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  if (argc <= 2) {
+    PrintHelp(ldb_options, argv[0]);
+    return 1;
+  }
+
+  LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(
+      argc, argv, options, ldb_options, column_families);
+  if (cmdObj == nullptr) {
+    fprintf(stderr, "Unknown command\n");
+    PrintHelp(ldb_options, argv[0]);
+    return 1;
+  }
+
+  if (!cmdObj->ValidateCmdLineOptions()) {
+    // This function owns cmdObj (it deletes it on the normal path below), so
+    // release it on this early exit as well instead of leaking it.
+    delete cmdObj;
+    return 1;
+  }
+
+  cmdObj->Run();
+  // Copy the result out before the command object is destroyed.
+  LDBCommandExecuteResult ret = cmdObj->GetExecuteState();
+  fprintf(stderr, "%s\n", ret.ToString().c_str());
+  delete cmdObj;
+
+  return ret.IsFailed() ? 1 : 0;
+}
+
+// Runs the ldb command described by argv and terminates the process with the
+// status returned by LDBCommandRunner::RunCommand. Does not return.
+void LDBTool::Run(int argc, char** argv, Options options,
+                  const LDBOptions& ldb_options,
+                  const std::vector<ColumnFamilyDescriptor>* column_families) {
+  exit(LDBCommandRunner::RunCommand(argc, argv, options, ldb_options,
+                                    column_families));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/pflag b/src/rocksdb/tools/pflag
new file mode 100755
index 000000000..f3394a666
--- /dev/null
+++ b/src/rocksdb/tools/pflag
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+#
+#(c) 2004-present, Facebook, all rights reserved.
+# See the LICENSE file for usage and distribution rights.
+#
+
+trap 'echo "Caught exception, dying"; exit' 1 2 3 15
+
+ME=`basename $0`
+SERVER=`hostname`
+
+#parameters used
+#
+Dump_Config=0
+DEBUG=
+OS=`/bin/uname -s`
+VMEM=
+RSS=
+CPU=
+VERBOSE=
+VAR=
+LIMIT=
+ACTION=
+N=
+WAIT=
+
+#
+#supported OS: Linux only for now. Easy to add
+#
+oscheck() {
+ case ${OS} in
+ Linux)
+ VMEM=vsz
+ RSS=rss
+ CPU=bsdtime
+ ;;
+ *)
+ die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
+ ;;
+ esac
+}
+
+
+verbose() {
+  # Echo the arguments to stderr, but only when DEBUG is non-empty (-v).
+  if [ -n "$DEBUG" ]; then
+    echo "$@" 1>&2
+  fi
+}
+
+# Echo the arguments to stderr unconditionally.
+warn() {
+  echo "$@" 1>&2
+}
+
+die() {
+ echo "ERROR: " "$@" >&2;
+ exit;
+}
+
+# Print the effective configuration (PID, metric, limit, wait, cycle count,
+# action) to stdout. The here-document below is emitted as-is after variable
+# and command substitution.
+dump_config() {
+ cat <<EOCONFIG;
+$ME running on ${HOSTNAME} at `date`
+
+Configuration for this run:
+ PID to monitor : ${PID}
+ Resource monitored : ${VAR}
+ Resource limit : ${LIMIT}
+ Check every : ${WAIT} seconds
+ No. of times run : ${N}
+ What to do : ${ACTION}
+EOCONFIG
+
+}
+
+# Print the arguments (used as an optional error message) followed by the help
+# text, then exit the script. The `exit` on the `cat` line runs right after
+# the here-document is printed, so this function never returns to its caller.
+usage() {
+ cat <<USAGE; exit
+$@
+
+Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
+
+Monitor a process for set of violations. Options:
+
+ -p: PID of process to monitor
+
+ -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
+
+ -l: what is the threshold/limit for the metric that is being sensed.
+ Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
+ NOTE: defaults to 1GB
+
+ -a: action. Currently {warn|die|kill} are supported.
+ The default action is to 'warn'. Here is the behavior:
+
+ warn: complain if usage exceeds threshold, but continue monitoring
+ kill: complain, kill the db_bench process and exit
+ die: if usage exceeds threshold, die immediately
+
+ -n: number of cycles to monitor. Default is to monitor until PID no longer exists.
+
+ -w: wait time per cycle of monitoring. Default is 5 seconds.
+
+ -v: verbose messaging
+
+USAGE
+
+}
+
+#set default values if none given
+set_defaults_if_noopt_given() {
+  # Fill in each setting that is still unset or empty after option parsing.
+  VAR=${VAR:-vsz}
+  LIMIT=${LIMIT:-1024000}
+  WAIT=${WAIT:-5}
+  N=${N:-999999}
+  ACTION=${ACTION:-warn}
+}
+
+validate_options() {
+  # A PID is required unless the run only dumps the configuration (-d).
+  if [ -z "$PID" ] && [ $Dump_Config -ne 1 ]; then
+    usage "PID is mandatory"
+  fi
+}
+
+###### START
+
+
+ # Parse the command line. The option string now contains "w:" so that the
+ # documented -w (wait) option reaches its case arm; "t:" is kept so that an
+ # existing "-t value" is still accepted and ignored as before.
+ while getopts ":p:x:l:a:n:w:t:vhd" opt; do
+   case $opt in
+     d)
+       Dump_Config=1
+       ;;
+     h)
+       usage;
+       ;;
+     a)
+       ACTION=${OPTARG};
+       ;;
+     v)
+       DEBUG=1;
+       ;;
+     p)
+       PID=$OPTARG;
+       ;;
+     x)
+       VAR=$OPTARG;
+       ;;
+     l)
+       LIMIT=$OPTARG;
+       ;;
+     w)
+       WAIT=$OPTARG;
+       ;;
+     n)
+       N=$OPTARG;
+       ;;
+     t)
+       # Legacy option letter: accepted, value unused.
+       ;;
+     :)
+       # With a leading ':' in the option string, getopts reports a missing
+       # argument here instead of printing its own error.
+       usage "Option -${OPTARG} requires an argument";
+       ;;
+     \?)
+       usage;
+       ;;
+   esac
+ done
+
+oscheck;
+set_defaults_if_noopt_given;
+validate_options;
+
+# Translate the symbolic metric names advertised by usage() into the ps column
+# names chosen by oscheck. Raw ps column names (e.g. vsz, rss) pass through.
+# NOTE(review): CPU is left untranslated because bsdtime prints "MM:SS", which
+# the integer comparisons below cannot handle; the "100m"/"1.5g" limit forms
+# shown in usage() are likewise not converted here.
+case "${VAR}" in
+  VMEM) VAR=${VMEM} ;;
+  RSS)  VAR=${RSS} ;;
+esac
+
+if [ $Dump_Config -eq 1 ]; then
+  dump_config;
+  exit;
+fi
+
+Done=0
+Cycles=0
+
+verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration";
+
+while [ $Done -eq 0 ]; do
+  # Honor -n: stop once the requested number of monitoring cycles has run.
+  if [ "$Cycles" -ge "$N" ]; then
+    verbose "Stopping after ${N} monitoring cycles"
+    break;
+  fi
+  Cycles=$((Cycles + 1))
+
+  # Sample the metric; values with an m/g suffix are scaled to KB by perl.
+  VAL=`/bin/ps h -p $PID -o ${VAR} | perl -pe 'chomp; s/(.*)m/$1 * 1024/e; s/(.*)g/$1 * 1024 * 1024/e;'`
+  # ps prints nothing once the process is gone; treat that as a clean end.
+  if [ ${VAL:=0} -eq 0 ]; then
+    warn "Process $PID ended without incident."
+    Done=1;
+    break;
+  fi
+
+  if [ $VAL -ge $LIMIT ]; then
+    Done=1;
+  else
+    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
+    sleep $WAIT;
+  fi
+  if [ $Done -eq 1 ]; then
+
+    if [ "$ACTION" = "kill" ]; then
+      kill ${PID} || kill -3 ${PID}
+      exit;
+
+    elif [ "$ACTION" = "warn" ]; then
+
+      # go back to monitoring.
+
+      warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
+      Done=0 #go back to monitoring
+      # Wait before the next sample; without this the loop spins and floods
+      # stderr for as long as the limit stays breached.
+      sleep $WAIT;
+
+    elif [ "$ACTION" = "die" ]; then
+      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
+      warn "The process details are below: "
+      warn "`ps -p ${PID} -o pid,ppid,bsdtime,rss,vsz,cmd,args`"
+      warn ""
+
+      #should we send email/notify someone? TODO... for now, bail.
+
+      exit -1;
+
+    fi
+  else
+    :
+    #warn "INFO: PID $PID, $VAR = $VAL, limit ($LIMIT) not exceeded";
+  fi
+done
+
diff --git a/src/rocksdb/tools/rdb/.gitignore b/src/rocksdb/tools/rdb/.gitignore
new file mode 100644
index 000000000..378eac25d
--- /dev/null
+++ b/src/rocksdb/tools/rdb/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/src/rocksdb/tools/rdb/API.md b/src/rocksdb/tools/rdb/API.md
new file mode 100644
index 000000000..e9c2e5925
--- /dev/null
+++ b/src/rocksdb/tools/rdb/API.md
@@ -0,0 +1,178 @@
+# JavaScript API
+
+## DBWrapper
+
+### Constructor
+
+ # Creates a new database wrapper object
+ RDB()
+
+### Open
+
+ # Open a new or existing RocksDB database.
+ #
+ # db_name (string) - Location of the database (inside the
+ # `/tmp` directory).
+ # column_families (string[]) - Names of additional column families
+ # beyond the default. If there are no other
+ # column families, this argument can be
+ # left off.
+ #
+ # Returns true if the database was opened successfully, or false otherwise
+ db_obj.open(db_name, column_families = [])
+
+### Get
+
+ # Get the value of a given key.
+ #
+ # key (string) - Which key to get the value of.
+ # column_family (string) - Which column family to check for the key.
+ # This argument can be left off for the default
+ # column family
+ #
+ # Returns the value (string) that is associated with the given key if
+ # one exists, or null otherwise.
+ db_obj.get(key, column_family = { default })
+
+### Put
+
+ # Associate a value with a key.
+ #
+ # key (string) - Which key to associate the value with.
+ # value (string) - The value to associate with the key.
+ # column_family (string) - Which column family to put the key-value pair
+ # in. This argument can be left off for the
+ # default column family.
+ #
+ # Returns true if the key-value pair was successfully stored in the
+ # database, or false otherwise.
+ db_obj.put(key, value, column_family = { default })
+
+### Delete
+
+ # Delete a value associated with a given key.
+ #
+ # key (string) - Which key to delete the value of.
+ # column_family (string) - Which column family to check for the key.
+ # This argument can be left off for the default
+ # column family
+ #
+ # Returns true if no error occurred while trying to delete the key in
+ # the database, or false otherwise. Note that this is NOT the same as
+ # whether a value was deleted; in the case of a specified key not having
+ # a value, this will still return true. Use the `get` method prior to
+ # this method to check if a value existed before the call to `delete`.
+ db_obj.delete(key, column_family = { default })
+
+### Dump
+
+ # Print out all the key-value pairs in a given column family of the
+ # database.
+ #
+ # column_family (string) - Which column family to dump the pairs from.
+ # This argument can be left off for the default
+ # column family.
+ #
+ # Returns true if the keys were successfully read from the database, or
+ # false otherwise.
+ db_obj.dump(column_family = { default })
+
+### WriteBatch
+
+ # Execute an atomic batch of writes (i.e. puts and deletes) to the
+ # database.
+ #
+ # cf_batches (BatchObject[]; see below) - Put and Delete writes grouped
+ # by column family to execute
+ # atomically.
+ #
+ # Returns true if the argument array was well-formed and was
+ # successfully written to the database, or false otherwise.
+ db_obj.writeBatch(cf_batches)
+
+### CreateColumnFamily
+
+ # Create a new column family for the database.
+ #
+ # column_family_name (string) - Name of the new column family.
+ #
+ # Returns true if the new column family was successfully created, or
+ # false otherwise.
+ db_obj.createColumnFamily(column_family_name)
+
+### CompactRange
+
+ # Compact the underlying storage for a given range.
+ #
+ # In addition to the endpoints of the range, the method is overloaded to
+ # accept a non-default column family, a set of options, or both.
+ #
+ # begin (string) - First key in the range to compact.
+ # end (string) - Last key in the range to compact.
+ # options (object) - Contains a subset of the following key-value
+ # pairs:
+ # * 'target_level' => int
+ # * 'target_path_id' => int
+ # column_family (string) - Which column family to compact the range in.
+ db_obj.compactRange(begin, end)
+ db_obj.compactRange(begin, end, options)
+ db_obj.compactRange(begin, end, column_family)
+ db_obj.compactRange(begin, end, options, column_family)
+
+
+
+### Close
+
+ # Close a database and free the memory associated with it.
+ #
+ # Return null.
+ db_obj.close()
+
+
+## BatchObject
+
+### Structure
+
+A BatchObject must have at least one of the following key-value pairs:
+
+* 'put' => Array of ['string1', 'string2'] pairs, each of which signifies that
+the key 'string1' should be associated with the value 'string2'
+* 'delete' => Array of strings, each of which is a key whose value should be
+deleted.
+
+The following key-value pair is optional:
+
+* 'column_family' => The name (string) of the column family to apply the
+changes to.
+
+### Examples
+
+ # Writes the key-value pairs 'firstname' => 'Saghm' and
+ # 'lastname' => 'Rossi' atomically to the database.
+ db_obj.writeBatch([
+ {
+ put: [ ['firstname', 'Saghm'], ['lastname', 'Rossi'] ]
+ }
+ ]);
+
+
+ # Deletes the values associated with 'firstname' and 'lastname' in
+ # the default column family and adds the key 'number_of_people' with
+ # the value '2'. Additionally, adds the key-value pair
+ # 'name' => 'Saghm Rossi' to the column family 'user1' and the pair
+ # 'name' => 'Matt Blaze' to the column family 'user2'. All writes
+ # are done atomically.
+ db_obj.writeBatch([
+ {
+ put: [ ['number_of_people', '2'] ],
+ delete: ['firstname', 'lastname']
+ },
+ {
+ put: [ ['name', 'Saghm Rossi'] ],
+ column_family: 'user1'
+ },
+ {
+ put: [ ['name', 'Matt Blaze'] ],
+ column_family: 'user2'
+ }
+ ]);
diff --git a/src/rocksdb/tools/rdb/README.md b/src/rocksdb/tools/rdb/README.md
new file mode 100644
index 000000000..f69b3f7b1
--- /dev/null
+++ b/src/rocksdb/tools/rdb/README.md
@@ -0,0 +1,40 @@
+# RDB - RocksDB Shell
+
+RDB is a NodeJS-based shell interface to RocksDB. It can also be used as a
+JavaScript binding for RocksDB within a Node application.
+
+## Setup/Compilation
+
+### Requirements
+
+* static RocksDB library (i.e. librocksdb.a)
+* libsnappy
+* node (tested on v0.10.33, no guarantees on anything else!)
+* node-gyp
+* python2 (for node-gyp; tested with 2.7.8)
+
+### Installation
+
+NOTE: If your default `python` binary is not a version of python2, add
+the arguments `--python /path/to/python2` to the `node-gyp` commands.
+
+1. Make sure you have the static library (i.e. "librocksdb.a") in the root
+directory of your rocksdb installation. If not, `cd` there and run
+`make static_lib`.
+
+2. Run `node-gyp configure` to generate the build.
+
+3. Run `node-gyp build` to compile RDB.
+
+## Usage
+
+### Running the shell
+
+Assuming everything compiled correctly, you can run the `rdb` executable
+located in the root of the `tools/rdb` directory to start the shell. The file is
+just a shell script that runs the node shell and loads the constructor for the
+RDB object into the top-level function `RDB`.
+
+### JavaScript API
+
+See `API.md` for how to use RocksDB from the shell.
diff --git a/src/rocksdb/tools/rdb/binding.gyp b/src/rocksdb/tools/rdb/binding.gyp
new file mode 100644
index 000000000..89145541c
--- /dev/null
+++ b/src/rocksdb/tools/rdb/binding.gyp
@@ -0,0 +1,25 @@
+{
+ "targets": [
+ {
+ "target_name": "rdb",
+ "sources": [
+ "rdb.cc",
+ "db_wrapper.cc",
+ "db_wrapper.h"
+ ],
+ "cflags_cc!": [
+ "-fno-exceptions"
+ ],
+ "cflags_cc+": [
+ "-std=c++11",
+ ],
+ "include_dirs+": [
+ "../../include"
+ ],
+ "libraries": [
+ "../../../librocksdb.a",
+ "-lsnappy"
+ ],
+ }
+ ]
+}
diff --git a/src/rocksdb/tools/rdb/db_wrapper.cc b/src/rocksdb/tools/rdb/db_wrapper.cc
new file mode 100644
index 000000000..632cf05a8
--- /dev/null
+++ b/src/rocksdb/tools/rdb/db_wrapper.cc
@@ -0,0 +1,526 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+#include <v8.h>
+#include <node.h>
+
+#include "db_wrapper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+
+namespace {
+ // Writes `str` to stdout, emitting a backslash in front of every backslash
+ // and every double-quote character.
+ void printWithBackSlashes(std::string str) {
+   for (const char ch : str) {
+     const bool needs_escape = (ch == '"' || ch == '\\');
+     if (needs_escape) {
+       std::cout << "\\";
+     }
+
+     std::cout << ch;
+   }
+ }
+
+ // Returns true when `obj` has a property named `key` whose value is an array.
+ bool has_key_for_array(Local<Object> obj, std::string key) {
+   Local<String> symbol = String::NewSymbol(key.c_str());
+   if (!obj->Has(symbol)) {
+     return false;
+   }
+
+   return obj->Get(symbol)->IsArray();
+ }
+}
+
+using namespace v8;
+
+
+Persistent<Function> DBWrapper::constructor;
+
+// Sets up the options used by every database opened through this wrapper.
+// No database is opened here, so db_ starts out null.
+DBWrapper::DBWrapper() : db_(nullptr) {
+  options_.IncreaseParallelism();
+  options_.OptimizeLevelStyleCompaction();
+  options_.disable_auto_compactions = true;
+  options_.create_if_missing = true;
+}
+
+// Releases the column family handles and then the database itself. Does
+// nothing when no database was ever opened.
+DBWrapper::~DBWrapper() {
+  if (db_ == nullptr) {
+    return;
+  }
+
+  // Handles have to be destroyed before the DB that created them is deleted.
+  for (auto& family : columnFamilies_) {
+    // operator[] lookups elsewhere in this class may have inserted null
+    // entries for unknown names; skip those.
+    if (family.second != nullptr) {
+      db_->DestroyColumnFamilyHandle(family.second);
+    }
+  }
+  columnFamilies_.clear();
+
+  delete db_;
+  db_ = nullptr;
+}
+
+// Reports whether `db` has a column family handle stored under `name`.
+bool DBWrapper::HasFamilyNamed(std::string& name, DBWrapper* db) {
+  const auto& families = db->columnFamilies_;
+  return families.count(name) > 0;
+}
+
+
+// Builds the JavaScript constructor for DBWrapper, attaches the prototype
+// methods listed below and publishes it on `exports` as "DBWrapper".
+//
+// NOTE: DBWrapper::Close is defined in this file but is not attached to the
+// prototype here.
+void DBWrapper::Init(Handle<Object> exports) {
+ Local<FunctionTemplate> tpl = FunctionTemplate::New(New);
+ tpl->SetClassName(String::NewSymbol("DBWrapper"));
+ // NOTE(review): 8 internal fields are reserved; it is not visible from this
+ // file why more than one would be needed -- confirm.
+ tpl->InstanceTemplate()->SetInternalFieldCount(8);
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("open"),
+ FunctionTemplate::New(Open)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("get"),
+ FunctionTemplate::New(Get)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("put"),
+ FunctionTemplate::New(Put)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("delete"),
+ FunctionTemplate::New(Delete)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("dump"),
+ FunctionTemplate::New(Dump)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("createColumnFamily"),
+ FunctionTemplate::New(CreateColumnFamily)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("writeBatch"),
+ FunctionTemplate::New(WriteBatch)->GetFunction());
+ tpl->PrototypeTemplate()->Set(String::NewSymbol("compactRange"),
+ FunctionTemplate::New(CompactRange)->GetFunction());
+
+ // Keep the constructor alive beyond this scope; New() uses it to create an
+ // instance when called without `new`.
+ constructor = Persistent<Function>::New(tpl->GetFunction());
+ exports->Set(String::NewSymbol("DBWrapper"), constructor);
+}
+
+// JS: open(path[, columnFamilies])
+//
+// Opens the database at `path`. `columnFamilies`, when given, must be an array
+// of strings naming the column families to open in addition to the default
+// one. Returns a JS boolean: false for malformed arguments or when RocksDB
+// reports a failure, true otherwise. The RocksDB status is kept in status_.
+Handle<Value> DBWrapper::Open(const Arguments& args) {
+ HandleScope scope;
+ DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+ if (!(args[0]->IsString() &&
+ (args[1]->IsUndefined() || args[1]->IsArray()))) {
+ return scope.Close(Boolean::New(false));
+ }
+
+ std::string db_file = *v8::String::Utf8Value(args[0]->ToString());
+
+ // The default column family is always first in the list.
+ std::vector<std::string> cfs = {ROCKSDB_NAMESPACE::kDefaultColumnFamilyName};
+
+ if (!args[1]->IsUndefined()) {
+ Handle<Array> array = Handle<Array>::Cast(args[1]);
+ for (uint i = 0; i < array->Length(); i++) {
+ if (!array->Get(i)->IsString()) {
+ return scope.Close(Boolean::New(false));
+ }
+
+ cfs.push_back(*v8::String::Utf8Value(array->Get(i)->ToString()));
+ }
+ }
+
+ // Only the default family was requested: use the plain Open overload. Note
+ // that columnFamilies_ is left untouched on this path.
+ if (cfs.size() == 1) {
+ db_wrapper->status_ = ROCKSDB_NAMESPACE::DB::Open(
+ db_wrapper->options_, db_file, &db_wrapper->db_);
+
+ return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+ }
+
+ std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> families;
+
+ for (std::vector<int>::size_type i = 0; i < cfs.size(); i++) {
+ families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(
+ cfs[i], ROCKSDB_NAMESPACE::ColumnFamilyOptions()));
+ }
+
+ std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
+ db_wrapper->status_ = ROCKSDB_NAMESPACE::DB::Open(
+ db_wrapper->options_, db_file, families, &handles, &db_wrapper->db_);
+
+ if (!db_wrapper->status_.ok()) {
+ return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+ }
+
+ // Remember each handle under the name it was requested with; this relies on
+ // handles[i] corresponding to cfs[i].
+ for (std::vector<int>::size_type i = 0; i < handles.size(); i++) {
+ db_wrapper->columnFamilies_[cfs[i]] = handles[i];
+ }
+
+ return scope.Close(Boolean::New(true));
+}
+
+
+// JS constructor. `new DBWrapper()` wraps a fresh C++ DBWrapper in the
+// receiver; a plain call `DBWrapper()` is forwarded to the stored constructor
+// so that it behaves like `new DBWrapper()`.
+Handle<Value> DBWrapper::New(const Arguments& args) {
+  HandleScope scope;
+
+  if (args.IsConstructCall()) {
+    DBWrapper* db_wrapper = new DBWrapper();
+    db_wrapper->Wrap(args.This());
+
+    return args.This();
+  }
+
+  // No constructor arguments are forwarded, so use the argument-less overload
+  // instead of building a zero-length array (not valid standard C++).
+  return scope.Close(constructor->NewInstance());
+}
+
+// JS: get(key[, columnFamily])
+//
+// Returns the value stored under `key` as a JS string, or null when the
+// arguments are malformed, the column family is unknown, or RocksDB does not
+// return OK (including "not found"). The RocksDB status is kept in status_.
+Handle<Value> DBWrapper::Get(const Arguments& args) {
+  HandleScope scope;
+
+  if (!(args[0]->IsString() &&
+        (args[1]->IsUndefined() || args[1]->IsString()))) {
+    return scope.Close(Null());
+  }
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  std::string key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string value;
+
+  if (args[1]->IsUndefined()) {
+    db_wrapper->status_ =
+        db_wrapper->db_->Get(ROCKSDB_NAMESPACE::ReadOptions(), key, &value);
+  } else {
+    std::string cf = *v8::String::Utf8Value(args[1]->ToString());
+    if (!db_wrapper->HasFamilyNamed(cf, db_wrapper)) {
+      return scope.Close(Null());
+    }
+
+    db_wrapper->status_ =
+        db_wrapper->db_->Get(ROCKSDB_NAMESPACE::ReadOptions(),
+                             db_wrapper->columnFamilies_[cf], key, &value);
+  }
+
+  if (!db_wrapper->status_.ok()) {
+    return scope.Close(Null());
+  }
+
+  // Build an ordinary string with an explicit length: values are data, not
+  // identifiers, so they should not be interned as symbols, and they must not
+  // be cut off at an embedded NUL byte.
+  return scope.Close(
+      String::New(value.data(), static_cast<int>(value.size())));
+}
+
+// JS: put(key, value[, columnFamily])
+//
+// Stores `value` under `key`. Returns a JS boolean: false for malformed
+// arguments or an unknown column family, otherwise whether RocksDB reported
+// OK. The RocksDB status is kept in status_.
+Handle<Value> DBWrapper::Put(const Arguments& args) {
+  HandleScope scope;
+
+  const bool valid_args =
+      args[0]->IsString() && args[1]->IsString() &&
+      (args[2]->IsUndefined() || args[2]->IsString());
+  if (!valid_args) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  std::string key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string value = *v8::String::Utf8Value(args[1]->ToString());
+
+  // No column family given: write to the default one.
+  if (args[2]->IsUndefined()) {
+    wrapper->status_ =
+        wrapper->db_->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, value);
+    return scope.Close(Boolean::New(wrapper->status_.ok()));
+  }
+
+  std::string family = *v8::String::Utf8Value(args[2]->ToString());
+  if (!wrapper->HasFamilyNamed(family, wrapper)) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  wrapper->status_ =
+      wrapper->db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+                        wrapper->columnFamilies_[family], key, value);
+  return scope.Close(Boolean::New(wrapper->status_.ok()));
+}
+
+// JS: delete(key[, columnFamily])
+//
+// Removes `key`. Returns a JS boolean: false when `key` is not a string or
+// the named column family is unknown, otherwise whether RocksDB reported OK.
+// The RocksDB status is kept in status_.
+Handle<Value> DBWrapper::Delete(const Arguments& args) {
+  HandleScope scope;
+
+  if (!args[0]->IsString()) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  std::string key = *v8::String::Utf8Value(args[0]->ToString());
+
+  // No column family given: delete from the default one.
+  if (args[1]->IsUndefined()) {
+    wrapper->status_ =
+        wrapper->db_->Delete(ROCKSDB_NAMESPACE::WriteOptions(), key);
+    return scope.Close(Boolean::New(wrapper->status_.ok()));
+  }
+
+  // The second argument is converted to a string whatever its JS type is.
+  std::string family = *v8::String::Utf8Value(args[1]->ToString());
+  if (!wrapper->HasFamilyNamed(family, wrapper)) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  wrapper->status_ =
+      wrapper->db_->Delete(ROCKSDB_NAMESPACE::WriteOptions(),
+                           wrapper->columnFamilies_[family], key);
+  return scope.Close(Boolean::New(wrapper->status_.ok()));
+}
+
+// JS: dump([columnFamily])
+//
+// Prints every entry of the default (or the named) column family to stdout,
+// one `"key" => "value"` line per entry with backslashes and double quotes
+// escaped. Returns false when the named column family is unknown, true
+// otherwise.
+Handle<Value> DBWrapper::Dump(const Arguments& args) {
+  HandleScope scope;
+  DBWrapper* wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  std::string family = *v8::String::Utf8Value(args[0]->ToString());
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+
+  if (!args[0]->IsUndefined()) {
+    if (!wrapper->HasFamilyNamed(family, wrapper)) {
+      return scope.Close(Boolean::New(false));
+    }
+
+    it.reset(wrapper->db_->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
+                                       wrapper->columnFamilies_[family]));
+  } else {
+    it.reset(wrapper->db_->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
+  }
+
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    std::cout << "\"";
+    printWithBackSlashes(it->key().ToString());
+    std::cout << "\" => \"";
+    printWithBackSlashes(it->value().ToString());
+    std::cout << "\"\n";
+  }
+
+  return scope.Close(Boolean::New(true));
+}
+
+// JS: createColumnFamily(name)
+//
+// Creates a column family with default options and remembers its handle under
+// `name`. Returns false when `name` is not a string, when a family with that
+// name is already registered, or when RocksDB reports a failure.
+Handle<Value> DBWrapper::CreateColumnFamily(const Arguments& args) {
+  HandleScope scope;
+
+  if (!args[0]->IsString()) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  std::string name = *v8::String::Utf8Value(args[0]->ToString());
+
+  if (wrapper->HasFamilyNamed(name, wrapper)) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* handle;
+  wrapper->status_ = wrapper->db_->CreateColumnFamily(
+      ROCKSDB_NAMESPACE::ColumnFamilyOptions(), name, &handle);
+
+  const bool created = wrapper->status_.ok();
+  if (created) {
+    wrapper->columnFamilies_[name] = handle;
+  }
+
+  return scope.Close(Boolean::New(created));
+}
+
+// Appends the operations described by `array` to `batch`, targeting the
+// default column family.
+//
+// With `del` set, every element must be a string key to delete; otherwise
+// every element must be an array whose first two entries are the string key
+// and string value to put. Returns false at the first element that does not
+// match; operations appended before that point stay in `batch`.
+bool DBWrapper::AddToBatch(ROCKSDB_NAMESPACE::WriteBatch& batch, bool del,
+                           Handle<Array> array) {
+  for (uint i = 0; i < array->Length(); i++) {
+    if (!del) {
+      if (!array->Get(i)->IsArray()) {
+        return false;
+      }
+
+      Handle<Array> pair = Handle<Array>::Cast(array->Get(i));
+      if (!(pair->Get(0)->IsString() && pair->Get(1)->IsString())) {
+        return false;
+      }
+
+      batch.Put(*v8::String::Utf8Value(pair->Get(0)->ToString()),
+                *v8::String::Utf8Value(pair->Get(1)->ToString()));
+    } else {
+      if (!array->Get(i)->IsString()) {
+        return false;
+      }
+
+      batch.Delete(*v8::String::Utf8Value(array->Get(i)->ToString()));
+    }
+  }
+
+  return true;
+}
+
+// Appends the operations described by `array` to `batch`, targeting the
+// column family registered on `db_wrapper` under the name `cf`.
+//
+// With `del` set, every element must be a string key to delete; otherwise
+// every element must be an array whose first two entries are the string key
+// and string value to put. Returns false when `cf` is not a known column
+// family or at the first element that does not match; operations appended
+// before that point stay in `batch`.
+bool DBWrapper::AddToBatch(ROCKSDB_NAMESPACE::WriteBatch& batch, bool del,
+                           Handle<Array> array, DBWrapper* db_wrapper,
+                           std::string cf) {
+  // Look the handle up without operator[], which would insert a null handle
+  // for an unknown name and then hand that null handle to the batch.
+  auto family = db_wrapper->columnFamilies_.find(cf);
+  if (family == db_wrapper->columnFamilies_.end() ||
+      family->second == nullptr) {
+    return false;
+  }
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* handle = family->second;
+
+  Handle<Array> put_pair;
+  for (uint i = 0; i < array->Length(); i++) {
+    if (del) {
+      if (!array->Get(i)->IsString()) {
+        return false;
+      }
+
+      batch.Delete(handle, *v8::String::Utf8Value(array->Get(i)->ToString()));
+      continue;
+    }
+
+    if (!array->Get(i)->IsArray()) {
+      return false;
+    }
+
+    put_pair = Handle<Array>::Cast(array->Get(i));
+
+    if (!put_pair->Get(0)->IsString() || !put_pair->Get(1)->IsString()) {
+      return false;
+    }
+
+    batch.Put(handle, *v8::String::Utf8Value(put_pair->Get(0)->ToString()),
+              *v8::String::Utf8Value(put_pair->Get(1)->ToString()));
+  }
+
+  return true;
+}
+
+// JS: writeBatch(subBatches)
+//
+// `subBatches` is an array of objects. Each object carries a "put" array
+// (of [key, value] pairs) and/or a "delete" array (of keys), plus an optional
+// "column_family" name. All operations are collected into one RocksDB write
+// batch and applied with a single Write call.
+//
+// Returns false, without writing anything, when any sub-batch is malformed:
+// not an object, neither "put" nor "delete" is an array, the named column
+// family is unknown, or an entry has the wrong shape. Otherwise returns
+// whether RocksDB reported OK; the status is kept in status_.
+Handle<Value> DBWrapper::WriteBatch(const Arguments& args) {
+  HandleScope scope;
+
+  if (!args[0]->IsArray()) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+  Handle<Array> sub_batches = Handle<Array>::Cast(args[0]);
+  ROCKSDB_NAMESPACE::WriteBatch batch;
+
+  for (uint i = 0; i < sub_batches->Length(); i++) {
+    if (!sub_batches->Get(i)->IsObject()) {
+      return scope.Close(Boolean::New(false));
+    }
+    Local<Object> sub_batch = sub_batches->Get(i)->ToObject();
+
+    // Only cast a property to Array after checking that it really is one.
+    const bool has_put = has_key_for_array(sub_batch, "put");
+    const bool has_delete = has_key_for_array(sub_batch, "delete");
+    if (!has_put && !has_delete) {
+      return scope.Close(Boolean::New(false));
+    }
+
+    const bool has_family =
+        sub_batch->Has(String::NewSymbol("column_family"));
+    std::string cf;
+    if (has_family) {
+      cf = *v8::String::Utf8Value(
+          sub_batch->Get(String::NewSymbol("column_family")));
+      if (!HasFamilyNamed(cf, db_wrapper)) {
+        return scope.Close(Boolean::New(false));
+      }
+    }
+
+    // Puts first, then deletes; a failure in either pass rejects the batch.
+    bool well_formed = true;
+    if (has_put) {
+      Handle<Array> puts =
+          Handle<Array>::Cast(sub_batch->Get(String::NewSymbol("put")));
+      well_formed = has_family
+                        ? AddToBatch(batch, false, puts, db_wrapper, cf)
+                        : AddToBatch(batch, false, puts);
+    }
+    if (well_formed && has_delete) {
+      Handle<Array> deletes =
+          Handle<Array>::Cast(sub_batch->Get(String::NewSymbol("delete")));
+      well_formed = has_family
+                        ? AddToBatch(batch, true, deletes, db_wrapper, cf)
+                        : AddToBatch(batch, true, deletes);
+    }
+
+    if (!well_formed) {
+      return scope.Close(Boolean::New(false));
+    }
+  }
+
+  db_wrapper->status_ =
+      db_wrapper->db_->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch);
+
+  return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+}
+
+// compactRange(begin, end): compacts the key range [begin, end] of the
+// default column family. Returns whether RocksDB reported OK; the status is
+// kept in status_. The caller (CompactRange) has already checked that both
+// arguments are strings.
+Handle<Value> DBWrapper::CompactRangeDefault(const Arguments& args) {
+  HandleScope scope;
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+  // Copy the keys into std::string first: a Slice only stores a pointer, and
+  // the Utf8Value temporary is destroyed at the end of its statement.
+  std::string begin_key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string end_key = *v8::String::Utf8Value(args[1]->ToString());
+  ROCKSDB_NAMESPACE::Slice begin(begin_key);
+  ROCKSDB_NAMESPACE::Slice end(end_key);
+
+  // The range is passed as (begin, end), matching the other Compact* helpers.
+  db_wrapper->status_ = db_wrapper->db_->CompactRange(&begin, &end);
+
+  return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+}
+
+// compactRange(begin, end, columnFamily): compacts the key range
+// [begin, end] of the named column family. Returns false when the column
+// family is unknown, otherwise whether RocksDB reported OK; the status is
+// kept in status_.
+Handle<Value> DBWrapper::CompactColumnFamily(const Arguments& args) {
+  HandleScope scope;
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+  // Copy the keys into std::string first: a Slice only stores a pointer, and
+  // the Utf8Value temporary is destroyed at the end of its statement.
+  std::string begin_key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string end_key = *v8::String::Utf8Value(args[1]->ToString());
+  std::string cf = *v8::String::Utf8Value(args[2]->ToString());
+  ROCKSDB_NAMESPACE::Slice begin(begin_key);
+  ROCKSDB_NAMESPACE::Slice end(end_key);
+
+  // Reject unknown families instead of letting operator[] insert and use a
+  // null handle.
+  if (!HasFamilyNamed(cf, db_wrapper)) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  db_wrapper->status_ = db_wrapper->db_->CompactRange(
+      db_wrapper->columnFamilies_[cf], &begin, &end);
+
+  return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+}
+
+// compactRange(begin, end, options): compacts the key range [begin, end] of
+// the default column family. `options` may carry an int32 "target_level"
+// and, only when that is present, an int32 "target_path_id"; otherwise
+// -1 and 0 are used. Returns false when `options` is not an object,
+// otherwise whether RocksDB reported OK; the status is kept in status_.
+Handle<Value> DBWrapper::CompactOptions(const Arguments& args) {
+  HandleScope scope;
+
+  if (!args[2]->IsObject()) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+  // Copy the keys into std::string first: a Slice only stores a pointer, and
+  // the Utf8Value temporary is destroyed at the end of its statement.
+  std::string begin_key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string end_key = *v8::String::Utf8Value(args[1]->ToString());
+  ROCKSDB_NAMESPACE::Slice begin(begin_key);
+  ROCKSDB_NAMESPACE::Slice end(end_key);
+
+  Local<Object> options = args[2]->ToObject();
+  int target_level = -1, target_path_id = 0;
+
+  if (options->Has(String::NewSymbol("target_level")) &&
+      options->Get(String::NewSymbol("target_level"))->IsInt32()) {
+    target_level = (int)(options->Get(
+        String::NewSymbol("target_level"))->ToInt32()->Value());
+
+    // The key must be present AND hold an int32, same as for target_level.
+    if (options->Has(String::NewSymbol("target_path_id")) &&
+        options->Get(String::NewSymbol("target_path_id"))->IsInt32()) {
+      target_path_id = (int)(options->Get(
+          String::NewSymbol("target_path_id"))->ToInt32()->Value());
+    }
+  }
+
+  db_wrapper->status_ = db_wrapper->db_->CompactRange(
+      &begin, &end, true, target_level, target_path_id);
+
+  return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+}
+
+// compactRange(begin, end, options, columnFamily): compacts the key range
+// [begin, end] of the named column family. `options` may carry an int32
+// "target_level" and, only when that is present, an int32 "target_path_id";
+// otherwise -1 and 0 are used. Returns false when `options` is not an
+// object, `columnFamily` is not a string or names an unknown family;
+// otherwise whether RocksDB reported OK. The status is kept in status_.
+Handle<Value> DBWrapper::CompactAll(const Arguments& args) {
+  HandleScope scope;
+
+  if (!args[2]->IsObject() || !args[3]->IsString()) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  DBWrapper* db_wrapper = ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+  // Copy the keys into std::string first: a Slice only stores a pointer, and
+  // the Utf8Value temporary is destroyed at the end of its statement.
+  std::string begin_key = *v8::String::Utf8Value(args[0]->ToString());
+  std::string end_key = *v8::String::Utf8Value(args[1]->ToString());
+  ROCKSDB_NAMESPACE::Slice begin(begin_key);
+  ROCKSDB_NAMESPACE::Slice end(end_key);
+
+  Local<Object> options = args[2]->ToObject();
+  std::string cf = *v8::String::Utf8Value(args[3]->ToString());
+
+  // Reject unknown families instead of letting operator[] insert and use a
+  // null handle.
+  if (!HasFamilyNamed(cf, db_wrapper)) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  int target_level = -1, target_path_id = 0;
+
+  if (options->Has(String::NewSymbol("target_level")) &&
+      options->Get(String::NewSymbol("target_level"))->IsInt32()) {
+    target_level = (int)(options->Get(
+        String::NewSymbol("target_level"))->ToInt32()->Value());
+
+    // The key must be present AND hold an int32, same as for target_level.
+    if (options->Has(String::NewSymbol("target_path_id")) &&
+        options->Get(String::NewSymbol("target_path_id"))->IsInt32()) {
+      target_path_id = (int)(options->Get(
+          String::NewSymbol("target_path_id"))->ToInt32()->Value());
+    }
+  }
+
+  db_wrapper->status_ = db_wrapper->db_->CompactRange(
+      db_wrapper->columnFamilies_[cf], &begin, &end, true, target_level,
+      target_path_id);
+
+  return scope.Close(Boolean::New(db_wrapper->status_.ok()));
+}
+
+// JS: compactRange(begin, end[, columnFamily | options[, columnFamily]])
+//
+// Validates that `begin` and `end` are strings and then dispatches on the
+// number of arguments: two go to CompactRangeDefault; three go to
+// CompactColumnFamily when the third is a string and to CompactOptions
+// otherwise; any other count goes to CompactAll.
+Handle<Value> DBWrapper::CompactRange(const Arguments& args) {
+  HandleScope scope;
+
+  if (!(args[0]->IsString() && args[1]->IsString())) {
+    return scope.Close(Boolean::New(false));
+  }
+
+  const int argc = args.Length();
+  if (argc == 2) {
+    return CompactRangeDefault(args);
+  }
+
+  if (argc == 3) {
+    if (args[2]->IsString()) {
+      return CompactColumnFamily(args);
+    }
+    return CompactOptions(args);
+  }
+
+  return CompactAll(args);
+}
+
+// Destroys the C++ DBWrapper attached to the receiver and returns null.
+//
+// NOTE: this method is not attached to the prototype in Init(), so it is not
+// reachable from JavaScript as the file stands.
+// NOTE(review): the JS object presumably still refers to the deleted wrapper
+// afterwards, so any later call on it would touch freed memory -- confirm
+// before exposing this method.
+Handle<Value> DBWrapper::Close(const Arguments& args) {
+ HandleScope scope;
+
+ delete ObjectWrap::Unwrap<DBWrapper>(args.This());
+
+ return scope.Close(Null());
+}
diff --git a/src/rocksdb/tools/rdb/db_wrapper.h b/src/rocksdb/tools/rdb/db_wrapper.h
new file mode 100644
index 000000000..4b57320cd
--- /dev/null
+++ b/src/rocksdb/tools/rdb/db_wrapper.h
@@ -0,0 +1,60 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#ifndef DBWRAPPER_H
+#define DBWRAPPER_H
+
+#include <map>
+#include <node.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/options.h"
+
+using namespace v8;
+
+// Used to encapsulate a particular instance of an opened database.
+//
+// This object should not be used directly in C++; it exists solely to provide
+// a mapping from a JavaScript object to a C++ code that can use the RocksDB
+// API.
+// NOTE(review): this class uses std::string and std::unordered_map, while the
+// header itself only includes <map>; it presumably relies on the RocksDB
+// headers to pull those in -- confirm.
+class DBWrapper : public node::ObjectWrap {
+ public:
+ // Registers the DBWrapper constructor and its prototype methods on
+ // `exports`.
+ static void Init(Handle<Object> exports);
+
+ private:
+ explicit DBWrapper();
+ ~DBWrapper();
+
+ // Helper methods
+
+ // True when `db` has a column family handle stored under `name`.
+ static bool HasFamilyNamed(std::string& name, DBWrapper* db);
+ // Append the puts (del == false) or deletes (del == true) described by
+ // `array` to `batch`; the second overload targets the column family named
+ // `cf`. Both return false on a malformed entry.
+ static bool AddToBatch(ROCKSDB_NAMESPACE::WriteBatch& batch, bool del,
+ Handle<Array> array);
+ static bool AddToBatch(ROCKSDB_NAMESPACE::WriteBatch& batch, bool del,
+ Handle<Array> array, DBWrapper* db_wrapper,
+ std::string cf);
+ // The four argument shapes that CompactRange dispatches to.
+ static Handle<Value> CompactRangeDefault(const v8::Arguments& args);
+ static Handle<Value> CompactColumnFamily(const Arguments& args);
+ static Handle<Value> CompactOptions(const Arguments& args);
+ static Handle<Value> CompactAll(const Arguments& args);
+
+ // C++ mappings of API methods
+ static Persistent<v8::Function> constructor;
+ static Handle<Value> Open(const Arguments& args);
+ static Handle<Value> New(const Arguments& args);
+ static Handle<Value> Get(const Arguments& args);
+ static Handle<Value> Put(const Arguments& args);
+ static Handle<Value> Delete(const Arguments& args);
+ static Handle<Value> Dump(const Arguments& args);
+ static Handle<Value> WriteBatch(const Arguments& args);
+ static Handle<Value> CreateColumnFamily(const Arguments& args);
+ static Handle<Value> CompactRange(const Arguments& args);
+ static Handle<Value> Close(const Arguments& args);
+
+ // Internal fields
+
+ // Options passed to DB::Open.
+ ROCKSDB_NAMESPACE::Options options_;
+ // Status of the most recent RocksDB call made through this wrapper.
+ ROCKSDB_NAMESPACE::Status status_;
+ // The opened database; deleted in the destructor.
+ ROCKSDB_NAMESPACE::DB* db_;
+ // Column family handles keyed by column family name.
+ std::unordered_map<std::string, ROCKSDB_NAMESPACE::ColumnFamilyHandle*>
+ columnFamilies_;
+};
+
+#endif
diff --git a/src/rocksdb/tools/rdb/rdb b/src/rocksdb/tools/rdb/rdb
new file mode 100755
index 000000000..05da1158b
--- /dev/null
+++ b/src/rocksdb/tools/rdb/rdb
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+node -e "RDB = require('./build/Release/rdb').DBWrapper; console.log('Loaded rocksdb in variable RDB'); repl = require('repl').start('> ');"
diff --git a/src/rocksdb/tools/rdb/rdb.cc b/src/rocksdb/tools/rdb/rdb.cc
new file mode 100644
index 000000000..119fcc410
--- /dev/null
+++ b/src/rocksdb/tools/rdb/rdb.cc
@@ -0,0 +1,16 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#ifndef BUILDING_NODE_EXTENSION
+#define BUILDING_NODE_EXTENSION
+#endif
+
+#include <node.h>
+#include <v8.h>
+// The wrapper header sits next to this file as db_wrapper.h.
+#include "db_wrapper.h"
+
+using namespace v8;
+
+// Module entry point: registers the DBWrapper constructor on `exports`.
+void InitAll(Handle<Object> exports) {
+  DBWrapper::Init(exports);
+}
+
+NODE_MODULE(rdb, InitAll)
diff --git a/src/rocksdb/tools/rdb/unit_test.js b/src/rocksdb/tools/rdb/unit_test.js
new file mode 100644
index 000000000..d01ae1df8
--- /dev/null
+++ b/src/rocksdb/tools/rdb/unit_test.js
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+assert = require('assert')
+RDB = require('./build/Release/rdb').DBWrapper
+exec = require('child_process').exec
+util = require('util')
+
+DB_NAME = '/tmp/rocksdbtest-' + process.getuid()
+
+a = RDB()
+assert.equal(a.open(DB_NAME, ['b']), false)
+
+exec(
+ util.format(
+ "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \
+ a = RDB('%s'); a.createColumnFamily('b')\"",
+ DB_NAME
+ ).exitCode, null
+)
+
+
+exec(
+ util.format(
+ "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \
+ a = RDB('%s', ['b'])\"",
+ DB_NAME
+ ).exitCode, null
+)
+
+exec('rm -rf ' + DB_NAME)
+
+a = RDB()
+assert.equal(a.open(DB_NAME, ['a']), false)
+assert(a.open(DB_NAME), true)
+assert(a.createColumnFamily('temp'))
+
+b = RDB()
+assert.equal(b.open(DB_NAME), false)
+
+exec('rm -rf ' + DB_NAME)
+
+DB_NAME += 'b'
+
+a = RDB()
+assert(a.open(DB_NAME))
+assert.equal(a.constructor.name, 'DBWrapper')
+assert.equal(a.createColumnFamily(), false)
+assert.equal(a.createColumnFamily(1), false)
+assert.equal(a.createColumnFamily(['']), false)
+assert(a.createColumnFamily('b'))
+assert.equal(a.createColumnFamily('b'), false)
+
+// Get and Put
+assert.equal(a.get(1), null)
+assert.equal(a.get(['a']), null)
+assert.equal(a.get('a', 1), null)
+assert.equal(a.get(1, 'a'), null)
+assert.equal(a.get(1, 1), null)
+
+assert.equal(a.put(1), false)
+assert.equal(a.put(['a']), false)
+assert.equal(a.put('a', 1), false)
+assert.equal(a.put(1, 'a'), false)
+assert.equal(a.put(1, 1), false)
+assert.equal(a.put('a', 'a', 1), false)
+assert.equal(a.put('a', 1, 'a'), false)
+assert.equal(a.put(1, 'a', 'a'), false)
+assert.equal(a.put('a', 1, 1), false)
+assert.equal(a.put(1, 'a', 1), false)
+assert.equal(a.put(1, 1, 'a'), false)
+assert.equal(a.put(1, 1, 1), false)
+
+
+assert.equal(a.get(), null)
+assert.equal(a.get('a'), null)
+assert.equal(a.get('a', 'c'), null)
+assert.equal(a.put(), false)
+assert.equal(a.put('a'), false)
+assert.equal(a.get('a', 'b', 'c'), null)
+
+assert(a.put('a', 'axe'))
+assert(a.put('a', 'first'))
+assert.equal(a.get('a'), 'first')
+assert.equal(a.get('a', 'b'), null)
+assert.equal(a.get('a', 'c'), null)
+
+assert(a.put('a', 'apple', 'b'))
+assert.equal(a.get('a', 'b'), 'apple')
+assert.equal(a.get('a'), 'first')
+assert(a.put('b', 'butter', 'b'), 'butter')
+assert(a.put('b', 'banana', 'b'))
+assert.equal(a.get('b', 'b'), 'banana')
+assert.equal(a.get('b'), null)
+assert.equal(a.get('b', 'c'), null)
+
+// Delete
+assert.equal(a.delete(1), false)
+assert.equal(a.delete('a', 1), false)
+assert.equal(a.delete(1, 'a'), false)
+assert.equal(a.delete(1, 1), false)
+
+assert.equal(a.delete('b'), true)
+assert(a.delete('a'))
+assert.equal(a.get('a'), null)
+assert.equal(a.get('a', 'b'), 'apple')
+assert.equal(a.delete('c', 'c'), false)
+assert.equal(a.delete('c', 'b'), true)
+assert(a.delete('b', 'b'))
+assert.equal(a.get('b', 'b'), null)
+
+// Dump
+console.log("MARKER 1")
+assert(a.dump())
+console.log("Should be no output between 'MARKER 1' and here\n")
+console.log('Next line should be "a" => "apple"')
+assert(a.dump('b'))
+
+console.log("\nMARKER 2")
+assert.equal(a.dump('c'), false)
+console.log("Should be no output between 'MARKER 2' and here\n")
+
+// WriteBatch
+
+
+// Clean up test database
+exec('rm -rf ' + DB_NAME)
diff --git a/src/rocksdb/tools/reduce_levels_test.cc b/src/rocksdb/tools/reduce_levels_test.cc
new file mode 100644
index 000000000..1b1044f95
--- /dev/null
+++ b/src/rocksdb/tools/reduce_levels_test.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/ldb_cmd_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture that owns a scratch database under a per-thread test path and
+// offers small helpers to write, flush, move files between levels and run the
+// ldb "reduce levels" command against it.
+class ReduceLevelTest : public testing::Test {
+public:
+  // Starts every test from an empty directory with no database open.
+  ReduceLevelTest() {
+    dbname_ = test::PerThreadDBPath("db_reduce_levels_test");
+    DestroyDB(dbname_, Options());
+    db_ = nullptr;
+  }
+
+  // Closes the database if a test bailed out (e.g. on a failed ASSERT_*)
+  // before reaching its own CloseDB() call.
+  ~ReduceLevelTest() override { CloseDB(); }
+
+  // Opens the database with `levels` levels; stores the handle in db_.
+  Status OpenDB(bool create_if_missing, int levels);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  // Returns the value for `k`, "NOT_FOUND" when the key is absent, or the
+  // status text for any other failure.
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Flushes the memtable; fails with InvalidArgument when no DB is open.
+  Status Flush() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    // db_ comes from DB::Open in OpenDB and is treated as a DBImpl here.
+    DBImpl* db_impl = static_cast<DBImpl*>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  // Runs TEST_CompactRange on levels 0 .. level-1 in order.
+  void MoveL0FileToLevel(int level) {
+    DBImpl* db_impl = static_cast<DBImpl*>(db_);
+    for (int i = 0; i < level; ++i) {
+      ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr));
+    }
+  }
+
+  // Deletes the database object, if any. Safe to call repeatedly.
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  // Runs the level-reducing ldb command; the DB must be closed first.
+  bool ReduceLevels(int target_level);
+
+  // Number of files on `level`, read from the DB property.
+  int FilesOnLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    return atoi(property.c_str());
+  }
+
+private:
+  std::string dbname_;  // Path of the scratch database.
+  DB* db_;              // Currently open database, or nullptr.
+};
+
+// Opens the scratch database with `num_levels` levels and stores the handle
+// in db_. On failure the status is printed to stderr and returned.
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) {
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = create_if_missing;
+  options.num_levels = num_levels;
+
+  ROCKSDB_NAMESPACE::Status status =
+      ROCKSDB_NAMESPACE::DB::Open(options, dbname_, &db_);
+  if (status.ok()) {
+    return status;
+  }
+
+  fprintf(stderr, "Can't open the db:%s\n", status.ToString().c_str());
+  return status;
+}
+
+// Builds and runs the ldb command that reduces the database to `target_level`
+// levels. Returns true when the command reports success, false when it fails
+// or could not be constructed from the prepared arguments.
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args =
+      ROCKSDB_NAMESPACE::ReduceDBLevelsCommand::PrepareArgs(
+          dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(
+      args, Options(), LDBOptions(), nullptr, LDBCommand::SelectCommand);
+  // Do not dereference a command that could not be created.
+  if (level_reducer == nullptr) {
+    return false;
+  }
+
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
+// A single file on the last level must stay on the last level each time the
+// number of levels is reduced.
+TEST_F(ReduceLevelTest, Last_Level) {
+  ASSERT_OK(OpenDB(true, 4));
+  ASSERT_OK(Put("aaaa", "11111"));
+  // Check the flush result, as All_Levels does, so a failed flush is reported
+  // here rather than as a confusing file-count mismatch below.
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(3);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  CloseDB();
+}
+
+// With a single file on level 0, reducing the number of levels step by step
+// must succeed and the database must reopen with the smaller level count.
+TEST_F(ReduceLevelTest, Top_Level) {
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("aaaa", "11111"));
+  // Check the flush result, as All_Levels does, so a failed flush is reported
+  // here rather than as a confusing file-count mismatch below.
+  ASSERT_OK(Flush());
+  ASSERT_EQ(FilesOnLevel(0), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  CloseDB();
+}
+
+// Places one file on each of levels 4, 3, 2 and 1 (one key per file), then
+// reduces the number of levels from 5 down to 2 and checks after every step
+// that all four keys are still readable.
+TEST_F(ReduceLevelTest, All_Levels) {
+ // Key "a" ends up on level 4.
+ ASSERT_OK(OpenDB(true, 5));
+ ASSERT_OK(Put("a", "a11111"));
+ ASSERT_OK(Flush());
+ MoveL0FileToLevel(4);
+ ASSERT_EQ(FilesOnLevel(4), 1);
+ CloseDB();
+
+ // Key "b" ends up on level 3.
+ ASSERT_OK(OpenDB(true, 5));
+ ASSERT_OK(Put("b", "b11111"));
+ ASSERT_OK(Flush());
+ MoveL0FileToLevel(3);
+ ASSERT_EQ(FilesOnLevel(3), 1);
+ ASSERT_EQ(FilesOnLevel(4), 1);
+ CloseDB();
+
+ // Key "c" ends up on level 2.
+ ASSERT_OK(OpenDB(true, 5));
+ ASSERT_OK(Put("c", "c11111"));
+ ASSERT_OK(Flush());
+ MoveL0FileToLevel(2);
+ ASSERT_EQ(FilesOnLevel(2), 1);
+ ASSERT_EQ(FilesOnLevel(3), 1);
+ ASSERT_EQ(FilesOnLevel(4), 1);
+ CloseDB();
+
+ // Key "d" ends up on level 1.
+ ASSERT_OK(OpenDB(true, 5));
+ ASSERT_OK(Put("d", "d11111"));
+ ASSERT_OK(Flush());
+ MoveL0FileToLevel(1);
+ ASSERT_EQ(FilesOnLevel(1), 1);
+ ASSERT_EQ(FilesOnLevel(2), 1);
+ ASSERT_EQ(FilesOnLevel(3), 1);
+ ASSERT_EQ(FilesOnLevel(4), 1);
+ CloseDB();
+
+ // Reduce to 4 levels; every key must survive.
+ ASSERT_TRUE(ReduceLevels(4));
+ ASSERT_OK(OpenDB(true, 4));
+ ASSERT_EQ("a11111", Get("a"));
+ ASSERT_EQ("b11111", Get("b"));
+ ASSERT_EQ("c11111", Get("c"));
+ ASSERT_EQ("d11111", Get("d"));
+ CloseDB();
+
+ // Reduce to 3 levels.
+ ASSERT_TRUE(ReduceLevels(3));
+ ASSERT_OK(OpenDB(true, 3));
+ ASSERT_EQ("a11111", Get("a"));
+ ASSERT_EQ("b11111", Get("b"));
+ ASSERT_EQ("c11111", Get("c"));
+ ASSERT_EQ("d11111", Get("d"));
+ CloseDB();
+
+ // Reduce to 2 levels.
+ ASSERT_TRUE(ReduceLevels(2));
+ ASSERT_OK(OpenDB(true, 2));
+ ASSERT_EQ("a11111", Get("a"));
+ ASSERT_EQ("b11111", Get("b"));
+ ASSERT_EQ("c11111", Get("c"));
+ ASSERT_EQ("d11111", Get("d"));
+ CloseDB();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point used when ROCKSDB_LITE is not defined: runs all registered
+// gtest cases.
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+// ROCKSDB_LITE entry point: reports that the test is skipped and exits with
+// success.
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/tools/regression_test.sh b/src/rocksdb/tools/regression_test.sh
new file mode 100755
index 000000000..79963738f
--- /dev/null
+++ b/src/rocksdb/tools/regression_test.sh
@@ -0,0 +1,470 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# The RocksDB regression test script.
+# REQUIREMENT: must be able to run make db_bench in the current directory
+#
+# This script will do the following things in order:
+#
+# 1. check out the specified rocksdb commit.
+# 2. build db_bench using the specified commit
+# 3. setup test directory $TEST_PATH. If not specified, then the test directory
+# will be "/tmp/rocksdb/regression_test"
+# 4. run set of benchmarks on the specified host
+# (can be either locally or remotely)
+# 5. generate report in the $RESULT_PATH. If RESULT_PATH is not specified,
+# RESULT_PATH will be set to $TEST_PATH/current_time
+#
+# = Examples =
+# * Run the regression test using rocksdb commit abcdef that outputs results
+# and temp files in "/my/output/dir"
+#
+# TEST_PATH=/my/output/dir COMMIT_ID=abcdef ./tools/regression_test.sh
+#
+# * Run the regression test on a remote host under "/my/output/dir" directory
+# and stores the result locally in "/my/benchmark/results" using commit
+# abcdef and with the rocksdb options specified in /my/path/to/OPTIONS-012345
+# with 1000000000 keys in each benchmark in the regression test where each
+# key and value are 100 and 900 bytes respectively:
+#
+# REMOTE_USER_AT_HOST=yhchiang@my.remote.host \
+# TEST_PATH=/my/output/dir \
+# RESULT_PATH=/my/benchmark/results \
+# COMMIT_ID=abcdef \
+# OPTIONS_FILE=/my/path/to/OPTIONS-012345 \
+# NUM_KEYS=1000000000 \
+# KEY_SIZE=100 \
+# VALUE_SIZE=900 \
+# ./tools/regression_test.sh
+#
+# = Regression test environmental parameters =
+# DEBUG: If true, then the script will not checkout master and build db_bench
+# if db_bench already exists
+# Default: 0
+# TEST_MODE: If 1, run fillseqdeterministic and benchmarks both
+# if 0, only run fillseqdeterministic
+# if 2, only run benchmarks
+# Default: 1
+# TEST_PATH: the root directory of the regression test.
+# Default: "/tmp/rocksdb/regression_test"
+# RESULT_PATH: the directory where the regression results will be generated.
+# Default: "$TEST_PATH/current_time"
+# REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
+# TEST_PATH directory and outputs test results locally in RESULT_PATH
+# The REMOTE_USER_AT_HOST should follow the format user-id@host.name
+# DB_PATH: the path where the rocksdb database will be created during the
+# regression test. Default: $TEST_PATH/db
+# WAL_PATH: the path where the rocksdb WAL will be written.
+# Default: $TEST_PATH/wal
+# OPTIONS_FILE: If specified, then the regression test will use the specified
+# file to initialize the RocksDB options in its benchmarks. Note that
+# this feature only work for commits after 88acd93 or rocksdb version
+# later than 4.9.
+# DELETE_TEST_PATH: If true, then the test directory will be deleted
+# after the script ends.
+# Default: 0
+#
+# = db_bench parameters =
+# NUM_THREADS: The number of concurrent foreground threads that will issue
+# database operations in the benchmark. Default: 16.
+# NUM_KEYS: The key range that will be used in the entire regression test.
+# Default: 1G.
+# NUM_OPS: The number of operations (reads, writes, or deletes) that will
+# be issued in EACH thread.
+# Default: $NUM_KEYS / $NUM_THREADS
+# KEY_SIZE: The size of each key in bytes in db_bench. Default: 100.
+# VALUE_SIZE: The size of each value in bytes in db_bench. Default: 900.
+# CACHE_SIZE: The size of RocksDB block cache used in db_bench. Default: 1G
+# STATISTICS: If 1, then statistics is on in db_bench. Default: 0.
+# COMPRESSION_RATIO: The compression ratio of the key generated in db_bench.
+# Default: 0.5.
+# HISTOGRAM: If 1, then the histogram feature on performance feature is on.
+# STATS_PER_INTERVAL: If 1, then the statistics will be reported for every
+# STATS_INTERVAL_SECONDS seconds. Default 1.
+# STATS_INTERVAL_SECONDS: If STATS_PER_INTERVAL is set to 1, then statistics
+# will be reported for every STATS_INTERVAL_SECONDS. Default 600.
+# MAX_BACKGROUND_FLUSHES: The maximum number of concurrent flushes in
+# db_bench. Default: 4.
+# MAX_BACKGROUND_COMPACTIONS: The maximum number of concurrent compactions
+# in db_bench. Default: 16.
+# NUM_HIGH_PRI_THREADS: The number of high-pri threads available for
+# concurrent flushes in db_bench. Default: 4.
+# NUM_LOW_PRI_THREADS: The number of low-pri threads available for
+# concurrent compactions in db_bench. Default: 16.
+# SEEK_NEXTS: Controls how many Next() will be called after seek.
+# Default: 10.
+# SEED: random seed that controls the randomness of the benchmark.
+# Default: $( date +%s )
+
+#==============================================================================
+# CONSTANT
+#==============================================================================
+TITLE_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s,"
+TITLE_FORMAT+="%10s,%13s,%14s,%11s,%12s,"
+TITLE_FORMAT+="%7s,%11s,"
+TITLE_FORMAT+="%9s,%10s,%10s,%10s,%10s,%10s,%5s,"
+TITLE_FORMAT+="%5s,%5s,%5s" # time
+TITLE_FORMAT+="\n"
+
+DATA_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s,"
+DATA_FORMAT+="%10s,%13.0f,%14s,%11s,%12s,"
+DATA_FORMAT+="%7s,%11s,"
+DATA_FORMAT+="%9.0f,%10.0f,%10.0f,%10.0f,%10.0f,%10.0f,%5.0f,"
+DATA_FORMAT+="%5.0f,%5.0f,%5.0f" # time
+DATA_FORMAT+="\n"
+
+MAIN_PATTERN="$1""[[:blank:]]+:.*[[:blank:]]+([0-9\.]+)[[:blank:]]+ops/sec"
+PERC_PATTERN="Percentiles: P50: ([0-9\.]+) P75: ([0-9\.]+) "
+PERC_PATTERN+="P99: ([0-9\.]+) P99.9: ([0-9\.]+) P99.99: ([0-9\.]+)"
+#==============================================================================
+
+# Drives the whole regression run: resolve settings, build the tools, prepare
+# the directories, optionally build the base DB, run the benchmarks against a
+# checkpoint of it, then clean up.
+function main {
+ TEST_ROOT_DIR=${TEST_PATH:-"/tmp/rocksdb/regression_test"}
+ init_arguments $TEST_ROOT_DIR
+
+ build_db_bench_and_ldb
+ setup_test_directory
+
+ # TEST_MODE 0 or 1: make sure the base DB exists at $ORIGIN_PATH.
+ if [ $TEST_MODE -le 1 ]; then
+ # run_db_bench reads $DB_PATH, so point it at the origin DB temporarily.
+ tmp=$DB_PATH
+ DB_PATH=$ORIGIN_PATH
+ if ! test_remote "test -d $DB_PATH"; then
+ echo "Building DB..."
+ # compactall alone will not print ops or threads, which will fail update_report
+ run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
+ fi
+ DB_PATH=$tmp
+ fi
+
+ # TEST_MODE 1 or 2: run the benchmark suite on a checkpoint of the base DB.
+ if [ $TEST_MODE -ge 1 ]; then
+ build_checkpoint
+ run_db_bench "readrandom"
+ run_db_bench "readwhilewriting"
+ # deleterandom gets a tenth of the key range, split across the threads.
+ run_db_bench "deleterandom" $((NUM_KEYS / 10 / $NUM_THREADS))
+ run_db_bench "seekrandom"
+ run_db_bench "seekrandomwhilewriting"
+ fi
+
+ cleanup_test_directory $TEST_ROOT_DIR
+ echo ""
+ echo "Benchmark completed! Results are available in $RESULT_PATH"
+}
+
+############################################################################
+# Fills in every global setting used by the script, taking each value from the
+# environment when it is set and from the default shown here otherwise.
+# $1 --- root directory of the regression test
+# $3 --- DB path. Default: $1/db
+# $4 --- WAL path. Default: empty
+# $5 --- directory holding db_bench/ldb. Default: "." locally, $1/db_bench
+# when REMOTE_USER_AT_HOST is set
+# NOTE(review): main calls this with a single argument, so $3, $4 and $5 are
+# always empty there and the defaults are always used; the DB_PATH and
+# WAL_PATH environment variables described in the file header are not read
+# here -- confirm whether that is intended.
+function init_arguments {
+ # Size units in bytes.
+ K=1024
+ M=$((1024 * K))
+ G=$((1024 * M))
+
+ current_time=$(date +"%F-%H:%M:%S")
+ RESULT_PATH=${RESULT_PATH:-"$1/results/$current_time"}
+ # Always taken from the first line of "git log" ("commit <hash>", with the
+ # 7-character "commit " prefix cut off); any COMMIT_ID already in the
+ # environment is overwritten.
+ COMMIT_ID=`git log | head -n1 | cut -c 8-`
+ SUMMARY_FILE="$RESULT_PATH/SUMMARY.csv"
+
+ DB_PATH=${3:-"$1/db"}
+ # Default origin DB location: a "db" directory two levels above $DB_PATH.
+ ORIGIN_PATH=${ORIGIN_PATH:-"$(dirname $(dirname $DB_PATH))/db"}
+ WAL_PATH=${4:-""}
+ if [ -z "$REMOTE_USER_AT_HOST" ]; then
+ DB_BENCH_DIR=${5:-"."}
+ else
+ DB_BENCH_DIR=${5:-"$1/db_bench"}
+ fi
+
+ DEBUG=${DEBUG:-0}
+ TEST_MODE=${TEST_MODE:-1}
+ SCP=${SCP:-"scp"}
+ SSH=${SSH:-"ssh"}
+ NUM_THREADS=${NUM_THREADS:-16}
+ NUM_KEYS=${NUM_KEYS:-$((1 * G))} # key range
+ NUM_OPS=${NUM_OPS:-$(($NUM_KEYS / $NUM_THREADS))}
+ KEY_SIZE=${KEY_SIZE:-100}
+ VALUE_SIZE=${VALUE_SIZE:-900}
+ CACHE_SIZE=${CACHE_SIZE:-$((1 * G))}
+ STATISTICS=${STATISTICS:-0}
+ COMPRESSION_RATIO=${COMPRESSION_RATIO:-0.5}
+ HISTOGRAM=${HISTOGRAM:-1}
+ NUM_MULTI_DB=${NUM_MULTI_DB:-1}
+ STATS_PER_INTERVAL=${STATS_PER_INTERVAL:-1}
+ STATS_INTERVAL_SECONDS=${STATS_INTERVAL_SECONDS:-600}
+ MAX_BACKGROUND_FLUSHES=${MAX_BACKGROUND_FLUSHES:-4}
+ MAX_BACKGROUND_COMPACTIONS=${MAX_BACKGROUND_COMPACTIONS:-16}
+ NUM_HIGH_PRI_THREADS=${NUM_HIGH_PRI_THREADS:-4}
+ NUM_LOW_PRI_THREADS=${NUM_LOW_PRI_THREADS:-16}
+ DELETE_TEST_PATH=${DELETE_TEST_PATH:-0}
+ SEEK_NEXTS=${SEEK_NEXTS:-10}
+ SEED=${SEED:-$( date +%s )}
+}
+
+# $1 --- benchmark name
+# $2 --- number of operations. Default: $NUM_KEYS
+# $3 --- number of threads. Default $NUM_THREADS
+# $4 --- use_existing_db. Default: 1
+# $5 --- update_report. Default: 1
+function run_db_bench {
+ # this will terminate all currently-running db_bench
+ find_db_bench_cmd="ps aux | grep db_bench | grep -v grep | grep -v aux | awk '{print \$2}'"
+
+ ops=${2:-$NUM_OPS}
+ threads=${3:-$NUM_THREADS}
+ USE_EXISTING_DB=${4:-1}
+ UPDATE_REPORT=${5:-1}
+ echo ""
+ echo "======================================================================="
+ echo "Benchmark $1"
+ echo "======================================================================="
+ echo ""
+ db_bench_error=0
+ options_file_arg=$(setup_options_file)
+ echo "$options_file_arg"
+ # use `which time` to avoid using bash's internal time command
+ db_bench_cmd="("'\$(which time)'" -p $DB_BENCH_DIR/db_bench \
+ --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
+ --use_existing_db=$USE_EXISTING_DB \
+ --disable_auto_compactions \
+ --threads=$threads \
+ --num=$NUM_KEYS \
+ --reads=$ops \
+ --writes=$ops \
+ --deletes=$ops \
+ --key_size=$KEY_SIZE \
+ --value_size=$VALUE_SIZE \
+ --cache_size=$CACHE_SIZE \
+ --statistics=$STATISTICS \
+ $options_file_arg \
+ --compression_ratio=$COMPRESSION_RATIO \
+ --histogram=$HISTOGRAM \
+ --seek_nexts=$SEEK_NEXTS \
+ --stats_per_interval=$STATS_PER_INTERVAL \
+ --stats_interval_seconds=$STATS_INTERVAL_SECONDS \
+ --max_background_flushes=$MAX_BACKGROUND_FLUSHES \
+ --num_multi_db=$NUM_MULTI_DB \
+ --max_background_compactions=$MAX_BACKGROUND_COMPACTIONS \
+ --num_high_pri_threads=$NUM_HIGH_PRI_THREADS \
+ --num_low_pri_threads=$NUM_LOW_PRI_THREADS \
+ --seed=$SEED) 2>&1"
+ ps_cmd="ps aux"
+ if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+ echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
+ db_bench_cmd="$SSH $REMOTE_USER_AT_HOST \"$db_bench_cmd\""
+ ps_cmd="$SSH $REMOTE_USER_AT_HOST $ps_cmd"
+ fi
+
+ ## make sure no db_bench is running
+ # The following statement is necessary make sure "eval $ps_cmd" will success.
+ # Otherwise, if we simply check whether "$(eval $ps_cmd | grep db_bench)" is
+ # successful or not, then it will always be false since grep will return
+ # non-zero status when there's no matching output.
+ ps_output="$(eval $ps_cmd)"
+ exit_on_error $? "$ps_cmd"
+
+ # perform the actual command to check whether db_bench is running
+ grep_output="$(eval $ps_cmd | grep db_bench | grep -v grep)"
+ if [ "$grep_output" != "" ]; then
+ echo "Stopped regression_test.sh as there're still db_bench processes running:"
+ echo $grep_output
+ echo "Clean up test directory"
+ cleanup_test_directory $TEST_ROOT_DIR
+ exit 2
+ fi
+
+ ## run the db_bench
+ cmd="($db_bench_cmd || db_bench_error=1) | tee -a $RESULT_PATH/$1"
+ exit_on_error $?
+ echo $cmd
+ eval $cmd
+ exit_on_error $db_bench_error
+ if [ $UPDATE_REPORT -ne 0 ]; then
+ update_report "$1" "$RESULT_PATH/$1" $ops $threads
+ fi
+}
+
+# Creates the working DB at $DB_PATH as an ldb checkpoint of the DB at
+# $ORIGIN_PATH. With NUM_MULTI_DB > 1 one checkpoint is made per
+# sub-directory found under $ORIGIN_PATH. Commands go through ssh when
+# REMOTE_USER_AT_HOST is set.
+function build_checkpoint {
+ cmd_prefix=""
+ if [ -n "$REMOTE_USER_AT_HOST" ]; then
+ cmd_prefix="$SSH $REMOTE_USER_AT_HOST "
+ fi
+
+ if [ $NUM_MULTI_DB -le 1 ]; then
+ # checkpoint cannot build in directory already exists
+ $cmd_prefix rm -rf $DB_PATH
+ echo "Building checkpoint: $ORIGIN_PATH -> $DB_PATH ..."
+ $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH \
+ --db=$ORIGIN_PATH 2>&1
+ else
+ # "-links 2" selects the directories find reports with a link count of 2.
+ dirs=$($cmd_prefix find $ORIGIN_PATH -type d -links 2)
+ for dir in $dirs; do
+ db_index=$(basename $dir)
+ echo "Building checkpoints: $ORIGIN_PATH/$db_index -> $DB_PATH/$db_index ..."
+ $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index \
+ --db=$ORIGIN_PATH/$db_index 2>&1
+ done
+ fi
+}
+
+# Prints the product of $1 and $2, computed by bc so that non-integer
+# operands work.
+function multiply {
+ bc <<< "$1 * $2"
+}
+
+# Extracts throughput, latency percentiles and timing from one db_bench log
+# and appends them as a row to $SUMMARY_FILE.
+#
+# $1 --- name of the benchmark
+# $2 --- the filename of the output log of db_bench
+# $3 --- value for the "ops-per-thread" column
+# $4 --- value for the "num-threads" column
+function update_report {
+ main_result=$(grep $1 $2)
+ exit_on_error $?
+ perc_statement=$(grep Percentile $2)
+ exit_on_error $?
+
+ # Obtain ops / sec
+
+ [[ $main_result =~ $MAIN_PATTERN ]]
+ ops_per_s=${BASH_REMATCH[1]}
+
+ # Obtain percentile information
+ [[ $perc_statement =~ $PERC_PATTERN ]]
+ perc[0]=${BASH_REMATCH[1]} # p50
+ perc[1]=${BASH_REMATCH[2]} # p75
+ perc[2]=${BASH_REMATCH[3]} # p99
+ perc[3]=${BASH_REMATCH[4]} # p99.9
+ perc[4]=${BASH_REMATCH[5]} # p99.99
+
+ # Parse the output of the time command
+ real_sec=$(tail -3 $2 | grep real | awk '{print $2}')
+ user_sec=$(tail -3 $2 | grep user | awk '{print $2}')
+ sys_sec=$(tail -3 $2 | grep sys | awk '{print $2}')
+
+ # REMOTE_USER_AT_HOST is empty for local runs; it must be quoted so that it
+ # still occupies the "user@host" column instead of vanishing and shifting
+ # every following value one column to the left.
+ (printf "$DATA_FORMAT" \
+ $COMMIT_ID $1 "$REMOTE_USER_AT_HOST" $NUM_MULTI_DB $NUM_KEYS $KEY_SIZE $VALUE_SIZE \
+ $(multiply $COMPRESSION_RATIO 100) \
+ $3 $4 $CACHE_SIZE \
+ $MAX_BACKGROUND_FLUSHES $MAX_BACKGROUND_COMPACTIONS \
+ $ops_per_s \
+ $(multiply ${perc[0]} 1000) \
+ $(multiply ${perc[1]} 1000) \
+ $(multiply ${perc[2]} 1000) \
+ $(multiply ${perc[3]} 1000) \
+ $(multiply ${perc[4]} 1000) \
+ $DEBUG \
+ $real_sec \
+ $user_sec \
+ $sys_sec \
+ >> $SUMMARY_FILE)
+ exit_on_error $?
+}
+
+# Aborts the script when a step failed.
+# $1 --- exit status to check; a non-zero value ends the script with it
+# $2 --- optional text of the failed command, echoed for diagnosis
+function exit_on_error {
+ if [ $1 -ne 0 ]; then
+ echo ""
+ echo "ERROR: Benchmark did not complete successfully."
+ if [ -n "$2" ]; then
+ echo "Failure command: $2"
+ fi
+ echo "Partial results are output to $RESULT_PATH"
+ # Leave a marker in the summary so the failure is visible there too.
+ echo "ERROR" >> $SUMMARY_FILE
+ exit $1
+ fi
+}
+
+# Fetches all remotes and checks out the given commit; a failure of either
+# git command aborts the script.
+# $1 --- commit to check out
+function checkout_rocksdb {
+ echo "Checking out commit $1 ..."
+
+ git fetch --all || exit_on_error $?
+
+ git checkout $1 || exit_on_error $?
+}
+
+function build_db_bench_and_ldb {
+ echo "Building db_bench & ldb ..."
+
+ make clean
+ exit_on_error $?
+
+ DEBUG_LEVEL=0 PORTABLE=1 make db_bench ldb -j32
+ exit_on_error $?
+}
+
+# Runs command string $1 through test_remote and aborts the script, naming
+# the command, when it fails.
+function run_remote {
+ test_remote "$1" || exit_on_error $? "$1"
+}
+
+# Evaluates command string $1 -- over ssh on REMOTE_USER_AT_HOST when that is
+# set, locally otherwise -- and returns its status without aborting.
+function test_remote {
+ cmd="$1"
+ if [ -n "$REMOTE_USER_AT_HOST" ]; then
+ # Single quotes keep the command as one argument for the remote shell.
+ cmd="$SSH $REMOTE_USER_AT_HOST '$1'"
+ fi
+ eval "$cmd"
+}
+
+# Evaluates command string $1 on the local machine and aborts the script when
+# it fails.
+function run_local {
+ eval "$1" || exit_on_error $?
+}
+
+# Prints the db_bench "--options_file=..." argument for $OPTIONS_FILE, or only
+# an empty line when OPTIONS_FILE is not set. For remote runs the file is
+# first copied with $SCP to $DB_BENCH_DIR/OPTIONS_FILE on the remote host and
+# that remote path is the one printed.
+# run_db_bench captures this function's stdout with $(...), so everything
+# written to stdout here ends up on the db_bench command line.
+function setup_options_file {
+ if ! [ -z "$OPTIONS_FILE" ]; then
+ if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+ options_file="$DB_BENCH_DIR/OPTIONS_FILE"
+ run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file"
+ else
+ options_file="$OPTIONS_FILE"
+ fi
+ echo "--options_file=$options_file"
+ fi
+ echo ""
+}
+
+# Recreates the DB, WAL, db_bench and result directories from scratch, copies
+# the freshly built db_bench and ldb to the remote host when one is
+# configured, and starts a new $SUMMARY_FILE with the header row.
+function setup_test_directory {
+ echo "Deleting old regression test directories and creating new ones"
+
+ run_remote "rm -rf $DB_PATH"
+ # Only wipe DB_BENCH_DIR on a remote host. For local runs it is the
+ # directory the binaries were just built in (default "."), and "rm -rf ."
+ # fails. cleanup_test_directory applies the same guard.
+ if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+ run_remote "rm -rf $DB_BENCH_DIR"
+ fi
+ run_local "rm -rf $RESULT_PATH"
+
+ if ! [ -z "$WAL_PATH" ]; then
+ run_remote "rm -rf $WAL_PATH"
+ run_remote "mkdir -p $WAL_PATH"
+ fi
+
+ run_remote "mkdir -p $DB_PATH"
+
+ run_remote "mkdir -p $DB_BENCH_DIR"
+ run_remote "ls -l $DB_BENCH_DIR"
+
+ if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+ run_local "$SCP ./db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/db_bench"
+ run_local "$SCP ./ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/ldb"
+ fi
+
+ run_local "mkdir -p $RESULT_PATH"
+
+ # Header row; column order must match the data rows written by update_report.
+ (printf "$TITLE_FORMAT" \
+ "commit id" "benchmark" "user@host" "num-dbs" "key-range" "key-size" \
+ "value-size" "compress-rate" "ops-per-thread" "num-threads" "cache-size" \
+ "flushes" "compactions" \
+ "ops-per-s" "p50" "p75" "p99" "p99.9" "p99.99" "debug" \
+ "real-sec" "user-sec" "sys-sec" \
+ >> $SUMMARY_FILE)
+ exit_on_error $?
+}
+
+function cleanup_test_directory {
+
+ if [ $DELETE_TEST_PATH -ne 0 ]; then
+ echo "Clear old regression test directories and creating new ones"
+ run_remote "rm -rf $DB_PATH"
+ run_remote "rm -rf $WAL_PATH"
+ if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+ run_remote "rm -rf $DB_BENCH_DIR"
+ fi
+ run_remote "rm -rf $1"
+ else
+ echo "------------ DEBUG MODE ------------"
+ echo "DB PATH: $DB_PATH"
+ echo "WAL PATH: $WAL_PATH"
+ fi
+}
+
+############################################################################
+
+# shellcheck disable=SC2068
+main $@
diff --git a/src/rocksdb/tools/report_lite_binary_size.sh b/src/rocksdb/tools/report_lite_binary_size.sh
new file mode 100755
index 000000000..e3fe7bc26
--- /dev/null
+++ b/src/rocksdb/tools/report_lite_binary_size.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Script to report lite build binary size for latest RocksDB commits.
+# Usage:
+# ./report_lite_binary_size [num_recent_commits]
+
+# Number of most recent commits on master to measure (first argument,
+# default 10).
+num_recent_commits=${1:-10}
+
+echo "Computing RocksDB lite build binary size for the most recent $num_recent_commits commits."
+
+for ((i=0; i < num_recent_commits; i++))
+do
+ git checkout master~$i
+ commit_hash=$(git show -s --format=%H)
+ commit_time=$(git show -s --format=%ct)
+
+ # It would be nice to check whether scuba already has a record for the
+ # commit, but sandcastle does not seem to have the scuba CLI installed.
+
+ make clean
+
+ # Build exactly once; the status of this build decides build_succeeded.
+ if make OPT=-DROCKSDB_LITE static_lib
+ then
+ build_succeeded='true'
+ # Measure the stripped static library.
+ strip librocksdb.a
+ binary_size=$(stat -c %s librocksdb.a)
+ else
+ build_succeeded='false'
+ binary_size=0
+ fi
+
+ # Turn each value into a "key": value JSON fragment.
+ current_time="\"time\": $(date +%s)"
+ commit_hash="\"hash\": \"$commit_hash\""
+ commit_time="\"commit_time\": $commit_time"
+ build_succeeded="\"build_succeeded\": \"$build_succeeded\""
+ binary_size="\"binary_size\": $binary_size"
+
+ # Numeric fields go under "int", string fields under "normal".
+ scribe_log="{\"int\":{$current_time, $commit_time, $binary_size}, \"normal\":{$commit_hash, $build_succeeded}}"
+ echo "Logging to scribe: $scribe_log"
+ scribe_cat perfpipe_rocksdb_lite_build "$scribe_log"
+done
diff --git a/src/rocksdb/tools/rocksdb_dump_test.sh b/src/rocksdb/tools/rocksdb_dump_test.sh
new file mode 100755
index 000000000..532c53267
--- /dev/null
+++ b/src/rocksdb/tools/rocksdb_dump_test.sh
@@ -0,0 +1,9 @@
+# shellcheck disable=SC2148
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+TESTDIR=`mktemp -d ${TMPDIR:-/tmp}/rocksdb-dump-test.XXXXX`
+DUMPFILE="tools/sample-dump.dmp"
+
+# Verify that the sample dump file is undumpable and then redumpable.
+./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db
+./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump
+cmp $DUMPFILE $TESTDIR/dump
diff --git a/src/rocksdb/tools/run_flash_bench.sh b/src/rocksdb/tools/run_flash_bench.sh
new file mode 100755
index 000000000..26e253843
--- /dev/null
+++ b/src/rocksdb/tools/run_flash_bench.sh
@@ -0,0 +1,359 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: benchmark.sh exists in the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides a high level statistics
+
+# This should be run from the parent of the tools directory. The command line is:
+# [$env_vars] tools/run_flash_bench.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+# step 1) load - bulkload, compact, fillseq, overwrite
+# step 2) read-only for each number of threads
+# step 3) read-write for each number of threads
+# step 4) merge for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24".
+# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
+# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
+# only run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#
+# NKEYS - number of key/value pairs to load
+# BG_MBWRITEPERSEC - write rate limit in MB/second for tests in which
+# there is one thread doing writes and stats are
+# reported for read threads. "BG" stands for background.
+# If this is too large then the non-writer threads can get
+# starved. This is used for the "readwhile" tests.
+# FG_MBWRITEPERSEC - write rate limit in MB/second for tests like overwrite
+# where stats are reported for the write threads.
+# NSECONDS - number of seconds for which to run each test in steps 2,
+# 3 and 4. There are currently 15 tests in those steps and
+# they are repeated for each entry in list-of-threads so
+# this variable lets you control the total duration to
+# finish the benchmark.
+# RANGE_LIMIT - the number of rows to read per range query for tests that
+# do range queries.
+# VAL_SIZE - the length of the value in the key/value pairs loaded.
+# You can estimate the size of the test database from this,
+# NKEYS and the compression rate (--compression_ratio) set
+# in tools/benchmark.sh
+# BLOCK_LENGTH - value for db_bench --block_size
+# CACHE_BYTES - the size of the RocksDB block cache in bytes
+# DATA_DIR - directory in which to create database files
+# LOG_DIR - directory in which to create WAL files, may be the same
+# as DATA_DIR
+# DO_SETUP - when set to 0 then a backup of the database is copied from
+# $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+# are skipped. The WAL directory is also copied from a backup
+# if DATA_DIR != LOG_DIR. This allows tests from steps 2, 3, 4
+# to be repeated faster.
+# SAVE_SETUP - saves a copy of the database at the end of step 1 to
+# $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied
+# to $LOG_DIR.bak.
+# SKIP_LOW_PRI_TESTS - skip some of the tests which aren't crucial for getting
+# actionable benchmarking data (look for keywords "bulkload",
+# "sync=1", and "while merging").
+#
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+num_keys=${NKEYS:-$((1 * G))}
+# write rate for readwhile... tests
+bg_mbwps=${BG_MBWRITEPERSEC:-4}
+# write rate for tests other than readwhile, 0 means no limit
+fg_mbwps=${FG_MBWRITEPERSEC:-0}
+duration=${NSECONDS:-$((60 * 60))}
+nps=${RANGE_LIMIT:-10}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-8192}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+ nthreads=( 24 )
+else
+ nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+ echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+# By default we'll run all the tests. Set this to skip a set of tests which
+# aren't critical for getting key metrics.
+skip_low_pri_tests=${SKIP_LOW_PRI_TESTS:-0}
+
+if [[ $skip_low_pri_tests == 1 ]]; then
+ echo "Skipping some non-critical tests because SKIP_LOW_PRI_TESTS is set."
+fi
+
+output_dir="${TMPDIR:-/tmp}/output"
+
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+WAL_DIR=$wal_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" \
+ > $output_dir/report.txt
+
+# Notes on test sequence:
+# step 1) Setup database via sequential fill followed by overwrite to fragment it.
+# Done without setting DURATION to make sure that overwrite does $num_keys writes
+# step 2) read-only tests for all levels of concurrency requested
+# step 3) non read-only tests for all levels of concurrency requested
+# step 4) merge tests for all levels of concurrency requested. These must come last.
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+ echo Doing setup
+
+ if [[ $skip_low_pri_tests != 1 ]]; then
+ # Test 1: bulk load
+ env $ARGS ./tools/benchmark.sh bulkload
+ fi
+
+ # Test 2a: sequential fill with large values to get peak ingest
+ # adjust NUM_KEYS given the use of larger values
+ env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+ ./tools/benchmark.sh fillseq_disable_wal
+
+ # Test 2b: sequential fill with the configured value size
+ env $ARGS ./tools/benchmark.sh fillseq_disable_wal
+
+ # Test 2c: same as 2a, but with WAL being enabled.
+ env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+ ./tools/benchmark.sh fillseq_enable_wal
+
+ # Test 2d: same as 2b, but with WAL being enabled.
+ env $ARGS ./tools/benchmark.sh fillseq_enable_wal
+
+ # Test 3: single-threaded overwrite
+ env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+else
+ echo Restoring from backup
+
+ rm -rf $db_dir
+
+ if [ ! -d ${db_dir}.bak ]; then
+ echo Database backup does not exist at ${db_dir}.bak
+ exit -1
+ fi
+
+ echo Restore database from ${db_dir}.bak
+ cp -p -r ${db_dir}.bak $db_dir
+
+ if [[ $db_dir != $wal_dir ]]; then
+ rm -rf $wal_dir
+
+ if [ ! -d ${wal_dir}.bak ]; then
+ echo WAL backup does not exist at ${wal_dir}.bak
+ exit -1
+ fi
+
+ echo Restore WAL from ${wal_dir}.bak
+ cp -p -r ${wal_dir}.bak $wal_dir
+ fi
+fi
+
+if [[ $save_setup != 0 ]]; then
+ echo Save database to ${db_dir}.bak
+ cp -p -r $db_dir ${db_dir}.bak
+
+ if [[ $db_dir != $wal_dir ]]; then
+ echo Save WAL to ${wal_dir}.bak
+ cp -p -r $wal_dir ${wal_dir}.bak
+ fi
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+ # Test 4: random read
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom
+
+ # Test 5: random range scans
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+ ./tools/benchmark.sh fwdrange
+
+ # Test 6: random reverse range scans
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+ ./tools/benchmark.sh revrange
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+ # Test 7: overwrite with sync=0
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+ if [[ $skip_low_pri_tests != 1 ]]; then
+ # Test 8: overwrite with sync=1
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+ ./tools/benchmark.sh overwrite
+ fi
+
+ # Test 9: random update with sync=0
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+ ./tools/benchmark.sh updaterandom
+
+ if [[ $skip_low_pri_tests != 1 ]]; then
+ # Test 10: random update with sync=1
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
+ fi
+
+ # Test 11: random read while writing
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilewriting
+
+ # Test 12: range scan while writing
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting
+
+ # Test 13: reverse range scan while writing
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
+done
+
+###### Merge tests
+
+for num_thr in "${nthreads[@]}" ; do
+ # Test 14: random merge with sync=0
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh mergerandom
+
+ if [[ $skip_low_pri_tests != 1 ]]; then
+ # Test 15: random merge with sync=1
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+ ./tools/benchmark.sh mergerandom
+
+ # Test 16: random read while merging
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging
+
+ # Test 17: range scan while merging
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging
+
+ # Test 18: reverse range scan while merging
+ env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+ DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
+ fi
+done
+
+###### Universal compaction tests.
+
+# Use a single thread to reduce the variability in the benchmark.
+env $ARGS COMPACTION_TEST=1 NUM_THREADS=1 ./tools/benchmark.sh universal_compaction
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo bulkload > $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo fillseq_wal_disabled >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq.wal_disabled $output_dir/report.txt >> $output_dir/report2.txt
+
+echo fillseq_wal_enabled >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq.wal_enabled $output_dir/report.txt >> $output_dir/report2.txt
+
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo overwrite sync=1 >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+fi
+
+echo updaterandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep updaterandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo updaterandom sync=1 >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+fi
+
+echo mergerandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep mergerandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo mergerandom sync=1 >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+fi
+
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt >> $output_dir/report2.txt
+
+echo fwdrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+
+echo revrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+
+echo readwhile >> $output_dir/report2.txt >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo readwhile >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo fwdreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo fwdreadwhilemerging >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo revreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+ echo revreadwhilemerging >> $output_dir/report2.txt
+ head -1 $output_dir/report.txt >> $output_dir/report2.txt
+ grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/run_leveldb.sh b/src/rocksdb/tools/run_leveldb.sh
new file mode 100755
index 000000000..2fce8b12d
--- /dev/null
+++ b/src/rocksdb/tools/run_leveldb.sh
@@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: tools/benchmark_leveldb.sh exists relative to the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides high-level statistics.
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+# https://github.com/mdcallag/leveldb-1
+# http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+#
+# This should be run from the parent of the tools directory. The command line is:
+# [$env_vars] tools/run_leveldb.sh [list-of-threads]
+#
+# This runs the tests in the following sequence:
+# step 1) load - fillseq with large values, fillseq, overwrite
+# step 2) read-only for each number of threads
+# step 3) read-write for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24".
+# Were list-of-threads specified as "1 2 4" then the tests in steps 2 and 3
+# above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
+# only run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+# NKEYS - number of key/value pairs to load
+# NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
+# If this is too large then the non-writer threads can get
+# starved.
+# VAL_SIZE - the length of the value in the key/value pairs loaded.
+# You can estimate the size of the test database from this,
+# NKEYS and the compression rate (--compression_ratio) set
+# in tools/benchmark_leveldb.sh
+# BLOCK_LENGTH - value for db_bench --block_size
+# CACHE_BYTES - the size of the RocksDB block cache in bytes
+# DATA_DIR - directory in which to create database files
+# DO_SETUP - when set to 0 then a backup of the database is copied from
+# $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+# are skipped. This allows tests from steps 2, 3 to be repeated faster.
+# SAVE_SETUP - saves a copy of the database at the end of step 1 to
+# $DATA_DIR.bak.
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+# Workload parameters. Each one takes its value from the environment variable
+# named in the expansion and falls back to the default after ":-" when that
+# variable is unset or empty.
+num_keys=${NKEYS:-$((1 * G))}
+wps=${NWRITESPERSEC:-$((10 * K))}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-4096}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+ nthreads=( 24 )
+else
+ nthreads=( "$@" )
+fi
+
+# Announce every thread count stored in nthreads.
+for num_thr in "${nthreads[@]}" ; do
+ echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+# Directory that receives report.txt; honours $TMPDIR and defaults to /tmp.
+output_dir="${TMPDIR:-/tmp}/output"
+
+# Space-separated NAME=value words. The script expands $ARGS unquoted after
+# "env", so each word becomes one environment assignment for
+# tools/benchmark_leveldb.sh (values containing whitespace would be split).
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+# Create the output directory and start report.txt with the column header.
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest" \
+ > $output_dir/report.txt
+
+# Notes on test sequence:
+# step 1) Setup database via sequential fill followed by overwrite to fragment it.
+# Done without setting DURATION to make sure that overwrite does $num_keys writes
+# step 2) read-only tests for all levels of concurrency requested
+# step 3) non read-only tests for all levels of concurrency requested
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+ echo Doing setup
+
+ # Test 2a: sequential fill with large values to get peak ingest
+ # adjust NUM_KEYS given the use of larger values
+ env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+ ./tools/benchmark_leveldb.sh fillseq
+
+ # Test 2b: sequential fill with the configured value size
+ env $ARGS ./tools/benchmark_leveldb.sh fillseq
+
+ # Test 3: single-threaded overwrite
+ env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark_leveldb.sh overwrite
+
+else
+ echo Restoring from backup
+
+ rm -rf $db_dir
+
+ if [ ! -d ${db_dir}.bak ]; then
+ echo Database backup does not exist at ${db_dir}.bak
+ exit -1
+ fi
+
+ echo Restore database from ${db_dir}.bak
+ cp -p -r ${db_dir}.bak $db_dir
+fi
+
+if [[ $save_setup != 0 ]]; then
+ echo Save database to ${db_dir}.bak
+ cp -p -r $db_dir ${db_dir}.bak
+fi
+
+###### Read-only tests
+
+# Run the read-only benchmark once per requested thread count.
+for num_thr in "${nthreads[@]}" ; do
+ # Test 4: random read
+ env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh readrandom
+
+done
+
+###### Non read-only tests
+
+# Run the write and mixed read/write benchmarks once per requested thread count.
+for num_thr in "${nthreads[@]}" ; do
+ # Test 7: overwrite with sync=0
+ env $ARGS NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+ ./tools/benchmark_leveldb.sh overwrite
+
+ # Test 8: overwrite with sync=1
+ # Not run for now because LevelDB db_bench doesn't have an option to limit the
+ # test run to X seconds and doing sync-per-commit for --num can take too long.
+ # env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh overwrite
+
+ # Test 11: random read while writing
+ # WRITES_PER_SECOND carries the NWRITESPERSEC setting (default 10 * K).
+ env $ARGS NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+ ./tools/benchmark_leveldb.sh readwhilewriting
+
+done
+
+echo bulkload > $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+echo fillseq >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq $output_dir/report.txt >> $output_dir/report2.txt
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+echo overwrite sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt >> $output_dir/report2.txt
+echo readwhile >> $output_dir/report2.txt >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/sample-dump.dmp b/src/rocksdb/tools/sample-dump.dmp
new file mode 100644
index 000000000..4ec3a7732
--- /dev/null
+++ b/src/rocksdb/tools/sample-dump.dmp
Binary files differ
diff --git a/src/rocksdb/tools/sst_dump.cc b/src/rocksdb/tools/sst_dump.cc
new file mode 100644
index 000000000..1a0258c6d
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_dump_tool.h"
+
+// Entry point of the sst_dump command line tool.
+//
+// Returns the value produced by SSTDumpTool::Run so that a failure inside the
+// tool is visible in the process exit status (previously 0 was always
+// returned, regardless of what Run reported).
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  return tool.Run(argc, argv);
+}
+#else
+#include <stdio.h>
+// Fallback entry point used when ROCKSDB_LITE is defined: prints a notice to
+// stderr and exits with status 1.
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "Not supported in lite mode.\n");
+ return 1;
+}
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump_test.cc b/src/rocksdb/tools/sst_dump_test.cc
new file mode 100644
index 000000000..b8c475c3b
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_test.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdint.h>
+#include "rocksdb/sst_dump_tool.h"
+
+#include "file/random_access_file_reader.h"
+#include "port/stack_trace.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t optLength = 100;
+
+namespace {
+// Returns the encoded internal key whose user key is "k_" followed by `i`
+// zero-padded to at least four digits, with sequence number 0 and type
+// kTypeValue.
+static std::string MakeKey(int i) {
+  char user_key[100];
+  snprintf(user_key, sizeof(user_key), "k_%04d", i);
+  InternalKey internal_key(std::string(user_key), 0, ValueType::kTypeValue);
+  std::string encoded = internal_key.Encode().ToString();
+  return encoded;
+}
+
+// Returns the value stored for entry `i`. The text "v_" followed by `i`
+// zero-padded to at least four digits is wrapped in an InternalKey (sequence
+// number 0, type kTypeValue) and the encoded form is returned.
+static std::string MakeValue(int i) {
+  char text[100];
+  snprintf(text, sizeof(text), "v_%04d", i);
+  InternalKey wrapped(std::string(text), 0, ValueType::kTypeValue);
+  std::string encoded = wrapped.Encode().ToString();
+  return encoded;
+}
+
+// Writes a table file named `file_name` through `opts.env`, using
+// `opts.table_factory`, and fills it with 1024 entries produced by
+// MakeKey(i) / MakeValue(i) for i in [0, 1024).
+//
+// Failures to create, finish or close the file are reported as test
+// assertion failures.
+void createSST(const Options& opts, const std::string& file_name) {
+  Env* env = opts.env;
+  EnvOptions env_options(opts);
+  const ImmutableCFOptions imoptions(opts);
+  const MutableCFOptions moptions(opts);
+  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+  std::unique_ptr<TableBuilder> tb;
+
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(env->NewWritableFile(file_name, &file, env_options));
+
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+      int_tbl_prop_collector_factories;
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      NewLegacyWritableFileWrapper(std::move(file)), file_name, EnvOptions()));
+  std::string column_family_name;
+  int unknown_level = -1;
+  tb.reset(opts.table_factory->NewTableBuilder(
+      TableBuilderOptions(
+          imoptions, moptions, ikc, &int_tbl_prop_collector_factories,
+          CompressionType::kNoCompression, 0 /* sample_for_compression */,
+          CompressionOptions(), false /* skip_filters */, column_family_name,
+          unknown_level),
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      file_writer.get()));
+
+  // Populate exactly 1K keys
+  uint32_t num_keys = 1024;
+  for (uint32_t i = 0; i < num_keys; i++) {
+    tb->Add(MakeKey(i), MakeValue(i));
+  }
+  // A table that could not be finished or flushed would make the dump tests
+  // fail later for an unrelated-looking reason, so check both statuses here.
+  ASSERT_OK(tb->Finish());
+  ASSERT_OK(file_writer->Close());
+}
+
+// Deletes `file_name` and its companion "<stem>_dump.txt", where <stem> is
+// `file_name` without its last four characters (the ".sst" extension used by
+// the tests). The results of both deletions are ignored.
+void cleanup(const Options& opts, const std::string& file_name) {
+  Env* const env = opts.env;
+  env->DeleteFile(file_name);
+
+  const std::string stem = file_name.substr(0, file_name.length() - 4);
+  const std::string dump_file_name = stem + "_dump.txt";
+  env->DeleteFile(dump_file_name);
+}
+} // namespace
+
+// Fixture for the sst dump tool tests.
+//
+// Uses Env::Default() unless the TEST_ENV_URI environment variable is set, in
+// which case the Env is loaded from that URI. A per-thread test directory is
+// created on construction and removed on destruction unless KEEP_DB is set.
+class SSTDumpToolTest : public testing::Test {
+  std::string test_dir_;
+  Env* env_;
+  std::shared_ptr<Env> env_guard_;
+
+ public:
+  SSTDumpToolTest() : env_(Env::Default()) {
+    const char* test_env_uri = getenv("TEST_ENV_URI");
+    if (test_env_uri) {
+      Env::LoadEnv(test_env_uri, &env_, &env_guard_);
+    }
+    test_dir_ = test::PerThreadDBPath(env_, "sst_dump_test_db");
+    Status s = env_->CreateDirIfMissing(test_dir_);
+    EXPECT_OK(s);
+  }
+
+  ~SSTDumpToolTest() override {
+    if (getenv("KEEP_DB")) {
+      fprintf(stdout, "Data is still at %s\n", test_dir_.c_str());
+    } else {
+      EXPECT_OK(env_->DeleteDir(test_dir_));
+    }
+  }
+
+  // Env used by the tests.
+  Env* env() { return env_; }
+
+  // Returns "<test dir>/<file_name>".
+  std::string MakeFilePath(const std::string& file_name) const {
+    std::string path(test_dir_);
+    path.append("/").append(file_name);
+    return path;
+  }
+
+  // Fills `usage` with an argv-style command line:
+  //   usage[0] = "./sst_dump", usage[1] = command, usage[2] = --file=<path>.
+  // Every element is allocated with new char[optLength]; the caller releases
+  // each one with delete[]. Text longer than optLength - 1 is truncated by
+  // snprintf.
+  template <std::size_t N>
+  void PopulateCommandArgs(const std::string& file_path, const char* command,
+                           char* (&usage)[N]) const {
+    // The three writes below would run past the array for a smaller N.
+    static_assert(N >= 3, "usage must hold at least three arguments");
+    for (std::size_t i = 0; i < N; ++i) {
+      usage[i] = new char[optLength];
+    }
+    snprintf(usage[0], optLength, "./sst_dump");
+    snprintf(usage[1], optLength, "%s", command);
+    snprintf(usage[2], optLength, "--file=%s", file_path.c_str());
+  }
+};
+
+// Runs "--command=raw" on a table built with default Options (no filter
+// policy configured) and expects the tool to return 0.
+TEST_F(SSTDumpToolTest, EmptyFilter) {
+  Options opts;
+  opts.env = env();
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+// Runs "--command=raw" on a block-based table whose filter policy is
+// NewBloomFilterPolicy(10, true) and expects the tool to return 0.
+TEST_F(SSTDumpToolTest, FilterBlock) {
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
+
+  Options opts;
+  opts.env = env();
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+// Runs "--command=raw" on a block-based table whose filter policy is
+// NewBloomFilterPolicy(10, false) and expects the tool to return 0.
+TEST_F(SSTDumpToolTest, FullFilterBlock) {
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+
+  Options opts;
+  opts.env = env();
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+// Runs the tool with "--show_properties" on a block-based table that has a
+// bloom filter policy and expects the tool to return 0.
+TEST_F(SSTDumpToolTest, GetProperties) {
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+
+  Options opts;
+  opts.env = env();
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--show_properties", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+// Runs "--command=recompress" on a block-based table that has a bloom filter
+// policy and expects the tool to return 0.
+TEST_F(SSTDumpToolTest, CompressedSizes) {
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+
+  Options opts;
+  opts.env = env();
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=recompress", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+// Builds the table inside an in-memory Env layered on the fixture's Env, runs
+// the tool with "--command=verify_checksum" and expects it to return 0.
+TEST_F(SSTDumpToolTest, MemEnv) {
+  std::unique_ptr<Env> mem_env(NewMemEnv(env()));
+
+  Options opts;
+  opts.env = mem_env.get();
+  const std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=verify_checksum", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_EQ(0, tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (char* arg : usage) {
+    delete[] arg;
+  }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+// Test binary entry point: installs the stack trace handler, initializes
+// GoogleTest from the command line, calls RegisterCustomObjects and then runs
+// every registered test. Returns the result of RUN_ALL_TESTS().
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+// Entry point used when ROCKSDB_LITE is defined: reports that the test is
+// skipped and exits with status 0.
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump_tool.cc b/src/rocksdb/tools/sst_dump_tool.cc
new file mode 100644
index 000000000..be4dc961d
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_tool.cc
@@ -0,0 +1,778 @@
+
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "tools/sst_dump_tool_imp.h"
+
+#include <cinttypes>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "db/blob_index.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stores the given settings, copies `options`, prints "Process <file_path>"
+// to stdout and tries to open a table reader for the file. The outcome of
+// that attempt is kept in init_result_.
+//
+// Note: internal_comparator_ is built from BytewiseComparator(), not from
+// options.comparator.
+SstFileDumper::SstFileDumper(const Options& options,
+ const std::string& file_path, bool verify_checksum,
+ bool output_hex, bool decode_blob_index)
+ : file_name_(file_path),
+ read_num_(0),
+ verify_checksum_(verify_checksum),
+ output_hex_(output_hex),
+ decode_blob_index_(decode_blob_index),
+ options_(options),
+ ioptions_(options_),
+ moptions_(ColumnFamilyOptions(options_)),
+ internal_comparator_(BytewiseComparator()) {
+ fprintf(stdout, "Process %s\n", file_path.c_str());
+ init_result_ = GetTableReader(file_name_);
+}
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+
+// Name of the scratch file that CalculateCompressedTableSize writes inside
+// its in-memory Env.
+const char* testFileName = "test_file_name";
+
+// Compression types paired with their printable names.
+static const std::vector<std::pair<CompressionType, const char*>>
+ kCompressions = {
+ {CompressionType::kNoCompression, "kNoCompression"},
+ {CompressionType::kSnappyCompression, "kSnappyCompression"},
+ {CompressionType::kZlibCompression, "kZlibCompression"},
+ {CompressionType::kBZip2Compression, "kBZip2Compression"},
+ {CompressionType::kLZ4Compression, "kLZ4Compression"},
+ {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"},
+ {CompressionType::kXpressCompression, "kXpressCompression"},
+ {CompressionType::kZSTD, "kZSTD"}};
+
+// Opens `file_path`, reads its footer to learn the table magic number,
+// selects the matching table factory and creates table_reader_.
+//
+// Returns the first error encountered, or OK when the reader was created.
+// A failure to read the table properties is not an error: the file is then
+// treated as an old-format block-based table.
+Status SstFileDumper::GetTableReader(const std::string& file_path) {
+  // Warning about 'magic_number' being uninitialized shows up only in UBsan
+  // builds. Though access is guarded by 's.ok()' checks, fix the issue to
+  // avoid any warnings.
+  uint64_t magic_number = Footer::kInvalidTableMagicNumber;
+
+  // read table magic number
+  Footer footer;
+
+  std::unique_ptr<RandomAccessFile> file;
+  uint64_t file_size = 0;
+  Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_);
+  if (s.ok()) {
+    s = options_.env->GetFileSize(file_path, &file_size);
+  }
+
+  file_.reset(new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file),
+                                         file_path));
+
+  if (s.ok()) {
+    s = ReadFooterFromFile(file_.get(), nullptr /* prefetch_buffer */,
+                           file_size, &footer);
+  }
+  if (s.ok()) {
+    magic_number = footer.table_magic_number();
+  }
+
+  if (s.ok() && (magic_number == kPlainTableMagicNumber ||
+                 magic_number == kLegacyPlainTableMagicNumber)) {
+    // Plain tables are reopened with mmap reads enabled. Keep the status of
+    // the reopen: continuing after a failure would leave file_ wrapping a
+    // null file for the reads below.
+    soptions_.use_mmap_reads = true;
+    s = options_.env->NewRandomAccessFile(file_path, &file, soptions_);
+    if (s.ok()) {
+      file_.reset(new RandomAccessFileReader(
+          NewLegacyRandomAccessFileWrapper(file), file_path));
+    }
+  }
+
+  if (s.ok()) {
+    options_.comparator = &internal_comparator_;
+    // For old sst format, ReadTableProperties might fail but file can be read
+    if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) {
+      SetTableOptionsByMagicNumber(magic_number);
+    } else {
+      SetOldTableOptions();
+    }
+  }
+
+  if (s.ok()) {
+    s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
+                       &table_reader_);
+  }
+  return s;
+}
+
+// Creates table_reader_ for file_ through options_.table_factory.
+//
+// Only `file_size` is taken from the arguments; the remaining parameters are
+// unnamed and the corresponding members (ioptions_, soptions_,
+// internal_comparator_, table_reader_) are used instead. Ownership of file_
+// is moved into the factory call.
+Status SstFileDumper::NewTableReader(
+    const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
+    const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
+    std::unique_ptr<TableReader>* /*table_reader*/) {
+  TableReaderOptions reader_options(ioptions_,
+                                    moptions_.prefix_extractor.get(),
+                                    soptions_, internal_comparator_);
+
+  const bool is_block_based =
+      (BlockBasedTableFactory::kName == options_.table_factory->Name());
+  if (!is_block_based) {
+    // For all other factory implementation
+    return options_.table_factory->NewTableReader(
+        reader_options, std::move(file_), file_size, &table_reader_);
+  }
+
+  // We need to turn off pre-fetching of index and filter nodes for
+  // BlockBasedTable
+  return options_.table_factory->NewTableReader(
+      reader_options, std::move(file_), file_size, &table_reader_,
+      /*enable_prefetch=*/false);
+}
+
+// Asks the table reader to verify the file's checksums using default
+// ReadOptions and returns the resulting status.
+Status SstFileDumper::VerifyChecksum() {
+  // We could pass specific readahead setting into read options if needed.
+  const ReadOptions read_options;
+  Status result = table_reader_->VerifyChecksum(
+      read_options, TableReaderCaller::kSSTDumpTool);
+  return result;
+}
+
+// Dumps the table's contents into a new file named `out_filename`, created
+// through options_.env.
+//
+// Returns the error from creating the output file, else the error from the
+// dump itself, else the status of closing the output file.
+Status SstFileDumper::DumpTable(const std::string& out_filename) {
+  std::unique_ptr<WritableFile> out_file;
+  Env* env = options_.env;
+  Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
+  if (!s.ok()) {
+    // out_file is not usable; dereferencing it below would crash.
+    return s;
+  }
+  s = table_reader_->DumpTable(out_file.get());
+  const Status close_status = out_file->Close();
+  // A dump error takes precedence; otherwise surface a failed close.
+  return s.ok() ? close_status : s;
+}
+
+// Rebuilds the current table as a block-based table inside an in-memory Env,
+// using `tb_options` and the given `block_size`, and returns the resulting
+// file size. The number of data blocks of the rebuilt table is stored in
+// `*num_data_blocks`, which must not be null.
+//
+// Any failure (creating the scratch file, iterating the source table,
+// finishing the new table) is printed to stderr and terminates the process
+// with exit(1).
+uint64_t SstFileDumper::CalculateCompressedTableSize(
+    const TableBuilderOptions& tb_options, size_t block_size,
+    uint64_t* num_data_blocks) {
+  std::unique_ptr<WritableFile> out_file;
+  std::unique_ptr<Env> env(NewMemEnv(options_.env));
+  Status s = env->NewWritableFile(testFileName, &out_file, soptions_);
+  if (!s.ok()) {
+    fputs(s.ToString().c_str(), stderr);
+    exit(1);
+  }
+  std::unique_ptr<WritableFileWriter> dest_writer;
+  dest_writer.reset(
+      new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(out_file)),
+                             testFileName, soptions_));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = block_size;
+  BlockBasedTableFactory block_based_tf(table_options);
+  std::unique_ptr<TableBuilder> table_builder;
+  table_builder.reset(block_based_tf.NewTableBuilder(
+      tb_options,
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      dest_writer.get()));
+  std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
+      ReadOptions(), moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    table_builder->Add(iter->key(), iter->value());
+  }
+  // The loop also ends when the iterator hits an error, so the status has to
+  // be inspected after it; otherwise a truncated copy would be measured.
+  s = iter->status();
+  if (!s.ok()) {
+    fputs(s.ToString().c_str(), stderr);
+    exit(1);
+  }
+  s = table_builder->Finish();
+  if (!s.ok()) {
+    fputs(s.ToString().c_str(), stderr);
+    exit(1);
+  }
+  uint64_t size = table_builder->FileSize();
+  assert(num_data_blocks != nullptr);
+  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
+  env->DeleteFile(testFileName);
+  return size;
+}
+
+// For every entry of `compression_types`, rebuilds the table with that
+// compression and `block_size`, then prints one line to stdout with the file
+// size, the block count and how many blocks were / were not compressed.
+// Unsupported compression types are reported and skipped. Always returns 0.
+int SstFileDumper::ShowAllCompressionSizes(
+    size_t block_size,
+    const std::vector<std::pair<CompressionType, const char*>>&
+        compression_types) {
+  Options opts;
+  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  opts.statistics->set_stats_level(StatsLevel::kAll);
+  const ImmutableCFOptions imoptions(opts);
+  const ColumnFamilyOptions cfo(opts);
+  const MutableCFOptions moptions(cfo);
+  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+      block_based_table_factories;
+
+  // Share of `part` in `total` as a percentage; 0.0 when `total` is zero.
+  const auto percent_of = [](uint64_t part, uint64_t total) -> double {
+    if (0 == total) {
+      return 0.0;
+    }
+    return (static_cast<double>(part) / static_cast<double>(total)) * 100.0;
+  };
+
+  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
+
+  for (const auto& type_and_name : compression_types) {
+    const CompressionType type = type_and_name.first;
+    const char* const type_name = type_and_name.second;
+
+    if (!CompressionTypeSupported(type)) {
+      fprintf(stdout, "Unsupported compression type: %s.\n", type_name);
+      continue;
+    }
+
+    CompressionOptions compress_opt;
+    std::string column_family_name;
+    int unknown_level = -1;
+    TableBuilderOptions tb_opts(
+        imoptions, moptions, ikc, &block_based_table_factories, type,
+        0 /* sample_for_compression */, compress_opt,
+        false /* skip_filters */, column_family_name, unknown_level);
+    uint64_t num_data_blocks = 0;
+    const uint64_t file_size =
+        CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks);
+    fprintf(stdout, "Compression: %-24s", type_name);
+    fprintf(stdout, " Size: %10" PRIu64, file_size);
+    fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
+
+    const uint64_t compressed_blocks =
+        opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
+    const uint64_t not_compressed_blocks =
+        opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED);
+    // When the option enable_index_compression is true,
+    // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
+    const uint64_t counted_blocks = compressed_blocks + not_compressed_blocks;
+    if (counted_blocks > num_data_blocks) {
+      num_data_blocks = counted_blocks;
+    }
+    // Blocks accounted for by neither ticker.
+    const uint64_t ratio_not_compressed_blocks =
+        (num_data_blocks - compressed_blocks) - not_compressed_blocks;
+
+    fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+            percent_of(compressed_blocks, num_data_blocks));
+    fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
+            ratio_not_compressed_blocks,
+            percent_of(ratio_not_compressed_blocks, num_data_blocks));
+    fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
+            not_compressed_blocks,
+            percent_of(not_compressed_blocks, num_data_blocks));
+  }
+  return 0;
+}
+// Reads the table properties of `file` and, on success, stores them in
+// table_properties_. On failure a notice is printed to stdout and
+// table_properties_ is left untouched. Returns the status of the read.
+Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
+                                          RandomAccessFileReader* file,
+                                          uint64_t file_size) {
+  TableProperties* loaded_properties = nullptr;
+  Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
+      file, file_size, table_magic_number, ioptions_, &loaded_properties);
+  if (!s.ok()) {
+    fprintf(stdout, "Not able to read table properties\n");
+    return s;
+  }
+  table_properties_.reset(loaded_properties);
+  return s;
+}
+
+// Configures options_ for the table format identified by
+// `table_magic_number`. Requires table_properties_ to be set.
+//
+// Block-based magic numbers select a BlockBasedTableFactory (plus a no-op
+// prefix extractor when the file records a hash-search index); plain-table
+// magic numbers select a plain table factory in full-scan mode and enable
+// mmap reads. Any other value yields Status::InvalidArgument.
+Status SstFileDumper::SetTableOptionsByMagicNumber(
+    uint64_t table_magic_number) {
+  assert(table_properties_);
+  if (table_magic_number == kBlockBasedTableMagicNumber ||
+      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
+    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+    fprintf(stdout, "Sst file format: block-based\n");
+    auto& props = table_properties_->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+      if (index_type_on_file ==
+          BlockBasedTableOptions::IndexType::kHashSearch) {
+        options_.prefix_extractor.reset(NewNoopTransform());
+      }
+    }
+  } else if (table_magic_number == kPlainTableMagicNumber ||
+             table_magic_number == kLegacyPlainTableMagicNumber) {
+    options_.allow_mmap_reads = true;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = kPlainTableVariableLength;
+    plain_table_options.bloom_bits_per_key = 0;
+    plain_table_options.hash_table_ratio = 0;
+    plain_table_options.index_sparseness = 1;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPlain;
+    plain_table_options.full_scan_mode = true;
+
+    options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    fprintf(stdout, "Sst file format: plain table\n");
+  } else {
+    char error_msg_buffer[80];
+    // PRIx64 prints all 64 bits on every platform; "%lx" with a cast to long
+    // drops the upper half where long is 32 bits wide.
+    snprintf(error_msg_buffer, sizeof(error_msg_buffer),
+             "Unsupported table magic number --- %" PRIx64,
+             table_magic_number);
+    return Status::InvalidArgument(error_msg_buffer);
+  }
+
+  return Status::OK();
+}
+
+// Fallback used when no table properties could be read: selects a default
+// BlockBasedTableFactory, announces the old block-based format on stdout and
+// returns OK. Expects table_properties_ to be unset.
+Status SstFileDumper::SetOldTableOptions() {
+  assert(table_properties_ == nullptr);
+  options_.table_factory.reset(new BlockBasedTableFactory());
+  fprintf(stdout, "Sst file format: block-based(old version)\n");
+  return Status::OK();
+}
+
+// Iterates over the table in key order and optionally prints each entry.
+//
+//   print_kv            print "key => value" lines to stdout
+//   read_num            when > 0, stop once more than this many entries
+//                       have been visited
+//   has_from/from_key   start at the first entry at or after from_key
+//   has_to/to_key       stop before the first user key >= to_key
+//   use_from_as_prefix  stop at the first user key not starting with from_key
+//
+// The number of visited entries (including the one that triggered a stop) is
+// added to read_num_. Returns init_result_ when no table reader exists,
+// otherwise the iterator's final status.
+Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num,
+                                     bool has_from, const std::string& from_key,
+                                     bool has_to, const std::string& to_key,
+                                     bool use_from_as_prefix) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
+      ReadOptions(verify_checksum_, false), moptions_.prefix_extractor.get(),
+      /*arena=*/nullptr, /*skip_filters=*/false,
+      TableReaderCaller::kSSTDumpTool));
+
+  if (has_from) {
+    InternalKey start_key;
+    start_key.SetMinPossibleForUserKey(from_key);
+    iter->Seek(start_key.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+
+  uint64_t visited = 0;
+  for (; iter->Valid(); iter->Next()) {
+    Slice key = iter->key();
+    Slice value = iter->value();
+    ++visited;
+    if (read_num > 0 && visited > read_num) {
+      break;
+    }
+
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(key, &parsed_key)) {
+      std::cerr << "Internal Key ["
+                << key.ToString(true /* in hex*/)
+                << "] parse error!\n";
+      continue;
+    }
+
+    // the key returned is not prefixed with out 'from' key
+    if (use_from_as_prefix && !parsed_key.user_key.starts_with(from_key)) {
+      break;
+    }
+
+    // If end marker was specified, we stop before it
+    if (has_to &&
+        BytewiseComparator()->Compare(parsed_key.user_key, to_key) >= 0) {
+      break;
+    }
+
+    if (!print_kv) {
+      continue;
+    }
+
+    if (decode_blob_index_ && parsed_key.type == kTypeBlobIndex) {
+      // Blob index entries are decoded and shown in their debug form.
+      BlobIndex blob_index;
+      const Status decode_status = blob_index.DecodeFrom(value);
+      if (!decode_status.ok()) {
+        fprintf(stderr, "%s => error decoding blob index\n",
+                parsed_key.DebugString(output_hex_).c_str());
+        continue;
+      }
+      fprintf(stdout, "%s => %s\n",
+              parsed_key.DebugString(output_hex_).c_str(),
+              blob_index.DebugString(output_hex_).c_str());
+    } else {
+      fprintf(stdout, "%s => %s\n",
+              parsed_key.DebugString(output_hex_).c_str(),
+              value.ToString(output_hex_).c_str());
+    }
+  }
+
+  read_num_ += visited;
+
+  return iter->status();
+}
+
+// Stores the table reader's properties in `*table_properties` when a table
+// reader exists; otherwise leaves the output untouched. Returns init_result_
+// in both cases.
+Status SstFileDumper::ReadTableProperties(
+    std::shared_ptr<const TableProperties>* table_properties) {
+  if (table_reader_) {
+    *table_properties = table_reader_->GetTableProperties();
+  }
+  return init_result_;
+}
+
+namespace {
+
+// Prints the sst_dump usage text to stderr. Both --command lists name the
+// same five commands that the text goes on to describe.
+void print_help() {
+  fprintf(
+      stderr,
+ R"(sst_dump --file=<data_dir_OR_sst_file> [--command=check|scan|raw|verify|recompress]
+ --file=<data_dir_OR_sst_file>
+ Path to SST file or directory containing SST files
+
+ --env_uri=<uri of underlying Env>
+ URI of underlying Env
+
+ --command=check|scan|raw|verify|recompress
+ check: Iterate over entries in files but don't print anything except if an error is encountered (default command)
+ scan: Iterate over entries in files and print them to screen
+ raw: Dump all the table contents to <file_name>_dump.txt
+ verify: Iterate all the blocks in files verifying checksum to detect possible corruption but don't print anything except if a corruption is encountered
+ recompress: reports the SST file size if recompressed with different
+ compression types
+
+ --output_hex
+ Can be combined with scan command to print the keys and values in Hex
+
+ --decode_blob_index
+ Decode blob indexes and print them in a human-readable format during scans.
+
+ --from=<user_key>
+ Key to start reading from when executing check|scan
+
+ --to=<user_key>
+ Key to stop reading at when executing check|scan
+
+ --prefix=<user_key>
+ Returns all keys with this prefix when executing check|scan
+ Cannot be used in conjunction with --from
+
+ --read_num=<num>
+ Maximum number of entries to read when executing check|scan
+
+ --verify_checksum
+ Verify file checksum when executing check|scan
+
+ --input_key_hex
+ Can be combined with --from and --to to indicate that these values are encoded in Hex
+
+ --show_properties
+ Print table properties after iterating over the file when executing
+ check|scan|raw
+
+ --set_block_size=<block_size>
+ Can be combined with --command=recompress to set the block size that will
+ be used when trying different compression algorithms
+
+ --compression_types=<comma-separated list of CompressionType members, e.g.,
+ kSnappyCompression>
+ Can be combined with --command=recompress to run recompression for this
+ list of compression types
+
+ --parse_internal_key=<0xKEY>
+ Convenience option to parse an internal key on the command line. Dumps the
+ internal key in hex format {'key' @ SN: type}
+)");
+}
+
+} // namespace
+
+// Command-line driver behind the sst_dump tool.
+//
+// Parses argv (see print_help() for the accepted flags), picks the Env to use
+// and then runs the requested command. If GetChildren() succeeds on the
+// --file argument it is treated as a directory and each child is processed;
+// otherwise it is treated as a single file. Names that do not end in ".sst"
+// are skipped.
+//
+// Returns 0 when processing completes (per-file errors are reported on stderr
+// and do not change the return value) and -1 when --parse_internal_key is
+// given a key that cannot be decoded. Invalid arguments and fatal setup
+// errors terminate the process through exit(1).
+int SSTDumpTool::Run(int argc, char** argv, Options options) {
+  const char* env_uri = nullptr;
+  const char* dir_or_file = nullptr;
+  uint64_t read_num = std::numeric_limits<uint64_t>::max();
+  std::string command;
+
+  char junk;
+  uint64_t n;
+  bool verify_checksum = false;
+  bool output_hex = false;
+  bool decode_blob_index = false;
+  bool input_key_hex = false;
+  bool has_from = false;
+  bool has_to = false;
+  bool use_from_as_prefix = false;
+  bool show_properties = false;
+  bool show_summary = false;
+  bool set_block_size = false;
+  std::string from_key;
+  std::string to_key;
+  std::string block_size_str;
+  size_t block_size = 0;
+  std::vector<std::pair<CompressionType, const char*>> compression_types;
+  uint64_t total_num_files = 0;
+  uint64_t total_num_data_blocks = 0;
+  uint64_t total_data_block_size = 0;
+  uint64_t total_index_block_size = 0;
+  uint64_t total_filter_block_size = 0;
+  for (int i = 1; i < argc; i++) {
+    if (strncmp(argv[i], "--env_uri=", 10) == 0) {
+      env_uri = argv[i] + 10;
+    } else if (strncmp(argv[i], "--file=", 7) == 0) {
+      dir_or_file = argv[i] + 7;
+    } else if (strcmp(argv[i], "--output_hex") == 0) {
+      output_hex = true;
+    } else if (strcmp(argv[i], "--decode_blob_index") == 0) {
+      decode_blob_index = true;
+    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
+      input_key_hex = true;
+    } else if (sscanf(argv[i], "--read_num=%" SCNu64 "%c", &n, &junk) == 1) {
+      // SCNu64 matches uint64_t on every platform; scanning through a cast
+      // `unsigned long*` only fills half of `n` where long is 32 bits wide.
+      // The trailing %c makes the match fail if anything follows the number.
+      read_num = n;
+    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
+      verify_checksum = true;
+    } else if (strncmp(argv[i], "--command=", 10) == 0) {
+      command = argv[i] + 10;
+    } else if (strncmp(argv[i], "--from=", 7) == 0) {
+      from_key = argv[i] + 7;
+      has_from = true;
+    } else if (strncmp(argv[i], "--to=", 5) == 0) {
+      to_key = argv[i] + 5;
+      has_to = true;
+    } else if (strncmp(argv[i], "--prefix=", 9) == 0) {
+      from_key = argv[i] + 9;
+      use_from_as_prefix = true;
+    } else if (strcmp(argv[i], "--show_properties") == 0) {
+      show_properties = true;
+    } else if (strcmp(argv[i], "--show_summary") == 0) {
+      show_summary = true;
+    } else if (strncmp(argv[i], "--set_block_size=", 17) == 0) {
+      set_block_size = true;
+      block_size_str = argv[i] + 17;
+      std::istringstream iss(block_size_str);
+      iss >> block_size;
+      if (iss.fail()) {
+        fprintf(stderr, "block size must be numeric\n");
+        exit(1);
+      }
+    } else if (strncmp(argv[i], "--compression_types=", 20) == 0) {
+      std::string compression_types_csv = argv[i] + 20;
+      std::istringstream iss(compression_types_csv);
+      std::string compression_type;
+      while (std::getline(iss, compression_type, ',')) {
+        auto iter = std::find_if(
+            kCompressions.begin(), kCompressions.end(),
+            [&compression_type](std::pair<CompressionType, const char*> curr) {
+              return curr.second == compression_type;
+            });
+        if (iter == kCompressions.end()) {
+          fprintf(stderr, "%s is not a valid CompressionType\n",
+                  compression_type.c_str());
+          exit(1);
+        }
+        compression_types.emplace_back(*iter);
+      }
+    } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) {
+      // Stand-alone convenience mode: decode one key and return immediately.
+      std::string in_key(argv[i] + 21);
+      try {
+        in_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(in_key);
+      } catch (...) {
+        std::cerr << "ERROR: Invalid key input '"
+                  << in_key
+                  << "' Use 0x{hex representation of internal rocksdb key}"
+                  << std::endl;
+        return -1;
+      }
+      Slice sl_key = ROCKSDB_NAMESPACE::Slice(in_key);
+      ParsedInternalKey ikey;
+      if (!ParseInternalKey(sl_key, &ikey)) {
+        std::cerr << "Internal Key [" << sl_key.ToString(true /* in hex*/)
+                  << "] parse error!\n";
+        // Do not print `ikey`: it was not filled in by a successful parse.
+        return -1;
+      }
+      fprintf(stdout, "key=%s\n", ikey.DebugString(true).c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Unrecognized argument '%s'\n\n", argv[i]);
+      print_help();
+      exit(1);
+    }
+  }
+
+  if (use_from_as_prefix && has_from) {
+    fprintf(stderr, "Cannot specify --prefix and --from\n\n");
+    exit(1);
+  }
+
+  if (input_key_hex) {
+    // HexToString() can throw on malformed input (the --parse_internal_key
+    // branch above guards the same call), so fail with a message instead of
+    // letting the exception escape.
+    try {
+      if (has_from || use_from_as_prefix) {
+        from_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(from_key);
+      }
+      if (has_to) {
+        to_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(to_key);
+      }
+    } catch (...) {
+      fprintf(stderr,
+              "ERROR: --input_key_hex requires --from/--to/--prefix to be "
+              "hex encoded (0x...)\n");
+      exit(1);
+    }
+  }
+
+  if (dir_or_file == nullptr) {
+    fprintf(stderr, "file or directory must be specified.\n\n");
+    print_help();
+    exit(1);
+  }
+
+  std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
+
+  // If caller of SSTDumpTool::Run(...) does not specify a different env other
+  // than Env::Default(), then try to load custom env based on env_uri.
+  // Otherwise, the caller is responsible for creating custom env.
+  if (!options.env || options.env == ROCKSDB_NAMESPACE::Env::Default()) {
+    Env* env = Env::Default();
+    Status s = Env::LoadEnv(env_uri ? env_uri : "", &env, &env_guard);
+    if (!s.ok() && !s.IsNotFound()) {
+      fprintf(stderr, "LoadEnv: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+    options.env = env;
+  } else {
+    // %p expects a void*; pass one explicitly.
+    fprintf(stdout, "options.env is %p\n", static_cast<void*>(options.env));
+  }
+
+  std::vector<std::string> filenames;
+  ROCKSDB_NAMESPACE::Env* env = options.env;
+  ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames);
+  bool dir = true;
+  if (!st.ok()) {
+    // Not listable as a directory: treat the argument as a single file.
+    filenames.clear();
+    filenames.push_back(dir_or_file);
+    dir = false;
+  }
+
+  fprintf(stdout, "from [%s] to [%s]\n",
+          ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
+          ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
+
+  uint64_t total_read = 0;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    std::string filename = filenames.at(i);
+    if (filename.length() <= 4 ||
+        filename.rfind(".sst") != filename.length() - 4) {
+      // ignore
+      continue;
+    }
+    if (dir) {
+      filename = std::string(dir_or_file) + "/" + filename;
+    }
+
+    ROCKSDB_NAMESPACE::SstFileDumper dumper(options, filename, verify_checksum,
+                                            output_hex, decode_blob_index);
+    if (!dumper.getStatus().ok()) {
+      fprintf(stderr, "%s: %s\n", filename.c_str(),
+              dumper.getStatus().ToString().c_str());
+      continue;
+    }
+
+    if (command == "recompress") {
+      // Only the first readable file is recompressed; the tool then returns.
+      dumper.ShowAllCompressionSizes(
+          set_block_size ? block_size : 16384,
+          compression_types.empty() ? kCompressions : compression_types);
+      return 0;
+    }
+
+    if (command == "raw") {
+      std::string out_filename = filename.substr(0, filename.length() - 4);
+      out_filename.append("_dump.txt");
+
+      st = dumper.DumpTable(out_filename);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        exit(1);
+      } else {
+        fprintf(stdout, "raw dump written to file %s\n", out_filename.c_str());
+      }
+      continue;
+    }
+
+    // scan all files in give file path.
+    if (command == "" || command == "scan" || command == "check") {
+      st = dumper.ReadSequential(
+          command == "scan", read_num > 0 ? (read_num - total_read) : read_num,
+          has_from || use_from_as_prefix, from_key, has_to, to_key,
+          use_from_as_prefix);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(),
+                st.ToString().c_str());
+      }
+      total_read += dumper.GetReadNumber();
+      // Stop as soon as the budget is used up. With `>` instead of `>=` an
+      // exact hit would pass a remaining budget of 0 to the next file, and
+      // this function itself uses read_num == 0 to mean "no limit".
+      if (read_num > 0 && total_read >= read_num) {
+        break;
+      }
+    }
+
+    if (command == "verify") {
+      st = dumper.VerifyChecksum();
+      if (!st.ok()) {
+        fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(),
+                st.ToString().c_str());
+      } else {
+        fprintf(stdout, "The file is ok\n");
+      }
+      continue;
+    }
+
+    if (show_properties || show_summary) {
+      const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+
+      std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>
+          table_properties_from_reader;
+      st = dumper.ReadTableProperties(&table_properties_from_reader);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        fprintf(stderr, "Try to use initial table properties\n");
+        table_properties = dumper.GetInitTableProperties();
+      } else {
+        table_properties = table_properties_from_reader.get();
+      }
+      if (table_properties != nullptr) {
+        if (show_properties) {
+          fprintf(stdout,
+                  "Table Properties:\n"
+                  "------------------------------\n"
+                  " %s",
+                  table_properties->ToString("\n ", ": ").c_str());
+        }
+        // Totals feed the --show_summary report printed after the loop.
+        total_num_files += 1;
+        total_num_data_blocks += table_properties->num_data_blocks;
+        total_data_block_size += table_properties->data_size;
+        total_index_block_size += table_properties->index_size;
+        total_filter_block_size += table_properties->filter_size;
+        if (show_properties) {
+          fprintf(stdout,
+                  "Raw user collected properties\n"
+                  "------------------------------\n");
+          for (const auto& kv : table_properties->user_collected_properties) {
+            std::string prop_name = kv.first;
+            std::string prop_val = Slice(kv.second).ToString(true);
+            fprintf(stdout, " # %s: 0x%s\n", prop_name.c_str(),
+                    prop_val.c_str());
+          }
+        }
+      } else {
+        fprintf(stderr, "Reader unexpectedly returned null properties\n");
+      }
+    }
+  }
+  if (show_summary) {
+    fprintf(stdout, "total number of files: %" PRIu64 "\n", total_num_files);
+    fprintf(stdout, "total number of data blocks: %" PRIu64 "\n",
+            total_num_data_blocks);
+    fprintf(stdout, "total data block size: %" PRIu64 "\n",
+            total_data_block_size);
+    fprintf(stdout, "total index block size: %" PRIu64 "\n",
+            total_index_block_size);
+    fprintf(stdout, "total filter block size: %" PRIu64 "\n",
+            total_filter_block_size);
+  }
+  return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump_tool_imp.h b/src/rocksdb/tools/sst_dump_tool_imp.h
new file mode 100644
index 000000000..28c217ef2
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_tool_imp.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_dump_tool.h"
+
+#include <memory>
+#include <string>
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Wraps a single SST file for the sst_dump tool: scanning entries, reading
+// table properties, verifying checksums, dumping the raw table and estimating
+// recompressed sizes.
+//
+// NOTE(review): std::vector and std::pair are used below but this header only
+// includes <memory> and <string>; it presumably relies on a transitive include.
+class SstFileDumper {
+ public:
+ // The outcome of construction is reported through getStatus(); the flags
+ // presumably control checksum verification and output formatting of later
+ // reads -- confirm against the constructor definition.
+ explicit SstFileDumper(const Options& options, const std::string& file_name,
+ bool verify_checksum, bool output_hex,
+ bool decode_blob_index);
+
+ // Iterates over the table and returns the iterator's final status. The
+ // number of entries visited is added to the counter exposed by
+ // GetReadNumber(). print_kv/read_num/from/to presumably select printing,
+ // an entry limit and the key range -- confirm against the definition.
+ Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from,
+ const std::string& from_key, bool has_to,
+ const std::string& to_key,
+ bool use_from_as_prefix = false);
+
+ // Stores the table reader's properties in *table_properties when a table
+ // reader exists; always returns the initialization status.
+ Status ReadTableProperties(
+ std::shared_ptr<const TableProperties>* table_properties);
+ // Running total accumulated by ReadSequential() calls on this object.
+ uint64_t GetReadNumber() { return read_num_; }
+ // Non-owning pointer to table_properties_; may be null.
+ TableProperties* GetInitTableProperties() { return table_properties_.get(); }
+
+ Status VerifyChecksum();
+ // Writes a dump of the table to out_filename (used by --command=raw).
+ Status DumpTable(const std::string& out_filename);
+ // Status recorded in init_result_.
+ Status getStatus() { return init_result_; }
+
+ // Used by --command=recompress with the requested block size and the list
+ // of (compression type, display name) pairs to try.
+ int ShowAllCompressionSizes(
+ size_t block_size,
+ const std::vector<std::pair<CompressionType, const char*>>&
+ compression_types);
+
+ private:
+ // Get the TableReader implementation for the sst file
+ Status GetTableReader(const std::string& file_path);
+ // Private overload; presumably loads table_properties_ from the raw file
+ // -- confirm against the definition.
+ Status ReadTableProperties(uint64_t table_magic_number,
+ RandomAccessFileReader* file, uint64_t file_size);
+
+ uint64_t CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
+ size_t block_size,
+ uint64_t* num_data_blocks);
+
+ Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
+ Status SetOldTableOptions();
+
+ // Helper function to call the factory with settings specific to the
+ // factory implementation
+ Status NewTableReader(const ImmutableCFOptions& ioptions,
+ const EnvOptions& soptions,
+ const InternalKeyComparator& internal_comparator,
+ uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader);
+
+ std::string file_name_;
+ // Entries visited so far by ReadSequential().
+ uint64_t read_num_;
+ bool verify_checksum_;
+ bool output_hex_;
+ bool decode_blob_index_;
+ EnvOptions soptions_;
+
+ // options_ and internal_comparator_ will also be used in
+ // ReadSequential internally (specifically, seek-related operations)
+ Options options_;
+
+ // Returned by getStatus() and by the public ReadTableProperties().
+ Status init_result_;
+ std::unique_ptr<TableReader> table_reader_;
+ std::unique_ptr<RandomAccessFileReader> file_;
+
+ // NOTE(review): members are initialized in declaration order, so the
+ // position of these fields relative to options_ may matter -- confirm
+ // against the constructor before reordering.
+ const ImmutableCFOptions ioptions_;
+ const MutableCFOptions moptions_;
+ InternalKeyComparator internal_comparator_;
+ std::unique_ptr<TableProperties> table_properties_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/trace_analyzer.cc b/src/rocksdb/tools/trace_analyzer.cc
new file mode 100644
index 000000000..958078d1c
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Entry point of the trace_analyzer binary. Exactly one main() is compiled:
+//  - ROCKSDB_LITE builds: report that the tool is unsupported and fail;
+//  - builds without GFLAGS: ask the user to install gflags and fail;
+//  - otherwise: forward the command line to trace_analyzer_tool().
+#if defined(ROCKSDB_LITE)
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#elif !defined(GFLAGS)
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "tools/trace_analyzer_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv);
+}
+#endif  // ROCKSDB_LITE / GFLAGS
diff --git a/src/rocksdb/tools/trace_analyzer_test.cc b/src/rocksdb/tools/trace_analyzer_test.cc
new file mode 100644
index 000000000..d98ea2e80
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_test.cc
@@ -0,0 +1,727 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run trace_analyzer test\n");
+ return 1;
+}
+#else
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <thread>
+
+#include "db/db_test_util.h"
+#include "file/read_write_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/trace_analyzer_tool.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// Capacity of the argv array built by TraceAnalyzerTest::RunTraceAnalyzer.
+const int kMaxArgCount = 100;
+// Size in bytes of the flat buffer that holds the copied argument strings.
+const size_t kArgBufferSize = 100000;
+}  // namespace
+
+// Test fixture for the trace_analyzer tool. It records a small trace against
+// a scratch DB, runs trace_analyzer_tool() over it and compares the files the
+// tool writes with expected contents.
+class TraceAnalyzerTest : public testing::Test {
+ public:
+ TraceAnalyzerTest() : rnd_(0xFB) {
+ // test_path_ = test::TmpDir() + "trace_analyzer_test";
+ test_path_ = test::PerThreadDBPath("trace_analyzer_test");
+ env_ = ROCKSDB_NAMESPACE::Env::Default();
+ // The returned status is ignored; presumably the directory may already
+ // exist from an earlier test case -- confirm.
+ env_->CreateDir(test_path_);
+ dbname_ = test_path_ + "/db";
+ }
+
+ ~TraceAnalyzerTest() override {}
+
+ // Opens a DB at dbname_, traces a fixed sequence of operations into
+ // trace_path, writes the key-space file <test_path_>/0.txt and destroys the
+ // DB again. The order of operations matters: the tests compare the recorded
+ // sequence against expected output.
+ void GenerateTrace(std::string trace_path) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opt;
+ DB* db_ = nullptr;
+ std::string value;
+ std::unique_ptr<TraceWriter> trace_writer;
+ Iterator* single_iter = nullptr;
+
+ ASSERT_OK(
+ NewFileTraceWriter(env_, env_options_, trace_path, &trace_writer));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_OK(db_->StartTrace(trace_opt, std::move(trace_writer)));
+
+ // One write batch containing each write operation type once.
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("a", "aaaaaaaaa"));
+ ASSERT_OK(batch.Merge("b", "aaaaaaaaaaaaaaaaaaaa"));
+ ASSERT_OK(batch.Delete("c"));
+ ASSERT_OK(batch.SingleDelete("d"));
+ ASSERT_OK(batch.DeleteRange("e", "f"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ ASSERT_OK(db_->Get(ro, "a", &value));
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("a");
+ single_iter->SeekForPrev("b");
+ delete single_iter;
+ // NOTE(review): presumably separates the final Get in time from the
+ // operations above -- confirm why the pause is needed.
+ std::this_thread::sleep_for (std::chrono::seconds(1));
+
+ // Key "g" was never written, so the status is deliberately not asserted.
+ db_->Get(ro, "g", &value);
+
+ ASSERT_OK(db_->EndTrace());
+
+ ASSERT_OK(env_->FileExists(trace_path));
+
+ // Key-space file listing the keys "a".."f" in hex, one per line; the tests
+ // pass its directory to the tool through -key_space_dir.
+ std::unique_ptr<WritableFile> whole_f;
+ std::string whole_path = test_path_ + "/0.txt";
+ ASSERT_OK(env_->NewWritableFile(whole_path, &whole_f, env_options_));
+ std::string whole_str = "0x61\n0x62\n0x63\n0x64\n0x65\n0x66\n";
+ ASSERT_OK(whole_f->Append(whole_str));
+ delete db_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Packs args into a C-style argc/argv pair and asserts that
+ // trace_analyzer_tool() returns 0.
+ void RunTraceAnalyzer(const std::vector<std::string>& args) {
+ char arg_buffer[kArgBufferSize];
+ char* argv[kMaxArgCount];
+ int argc = 0;
+ int cursor = 0;
+
+ for (const auto& arg : args) {
+ ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+ ASSERT_LE(argc + 1, kMaxArgCount);
+ // Copy the argument including its terminating NUL into the flat buffer.
+ snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+ argv[argc++] = arg_buffer + cursor;
+ cursor += static_cast<int>(arg.size()) + 1;
+ }
+
+ ASSERT_EQ(0, ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv));
+ }
+
+ // Reads file_path line by line and asserts it has as many lines as cnt.
+ // With full_content each line must equal the expected one; otherwise only
+ // the first character of each line is compared.
+ void CheckFileContent(const std::vector<std::string>& cnt,
+ std::string file_path, bool full_content) {
+ ASSERT_OK(env_->FileExists(file_path));
+ std::unique_ptr<SequentialFile> f_ptr;
+ ASSERT_OK(env_->NewSequentialFile(file_path, &f_ptr, env_options_));
+
+ std::string get_line;
+ std::istringstream iss;
+ bool has_data = true;
+ std::vector<std::string> result;
+ uint32_t count;
+ Status s;
+ std::unique_ptr<FSSequentialFile> file =
+ NewLegacySequentialFileWrapper(f_ptr);
+ SequentialFileReader sf_reader(std::move(file), file_path,
+ 4096 /* filereadahead_size */);
+
+ // count is only a loop counter; the collected lines live in result.
+ for (count = 0; ReadOneLine(&iss, &sf_reader, &get_line, &has_data, &s);
+ ++count) {
+ ASSERT_OK(s);
+ result.push_back(get_line);
+ }
+
+ ASSERT_EQ(cnt.size(), result.size());
+ for (int i = 0; i < static_cast<int>(result.size()); i++) {
+ if (full_content) {
+ ASSERT_EQ(result[i], cnt[i]);
+ } else {
+ ASSERT_EQ(result[i][0], cnt[i][0]);
+ }
+ }
+
+ return;
+ }
+
+ // Runs the analyzer with a fixed set of base flags followed by paras_diff.
+ // The trace is generated first if trace_path does not exist yet.
+ void AnalyzeTrace(std::vector<std::string>& paras_diff,
+ std::string output_path, std::string trace_path) {
+ std::vector<std::string> paras = {"./trace_analyzer",
+ "-convert_to_human_readable_trace",
+ "-output_key_stats",
+ "-output_access_count_stats",
+ "-output_prefix=test",
+ "-output_prefix_cut=1",
+ "-output_time_series",
+ "-output_value_distribution",
+ "-output_qps_stats",
+ "-no_key",
+ "-no_print"};
+ for (auto& para : paras_diff) {
+ paras.push_back(para);
+ }
+ Status s = env_->FileExists(trace_path);
+ if (!s.ok()) {
+ GenerateTrace(trace_path);
+ }
+ // Status ignored, as in the constructor.
+ env_->CreateDir(output_path);
+ RunTraceAnalyzer(paras);
+ }
+
+ ROCKSDB_NAMESPACE::Env* env_;
+ EnvOptions env_options_;
+ // Scratch directory holding the trace, the key-space file and the outputs.
+ std::string test_path_;
+ // Location of the temporary DB used while recording the trace.
+ std::string dbname_;
+ Random rnd_;
+};
+
+// Test analyzing of Get: run the analyzer with -analyze_get and compare every
+// output file it writes with the expected lines.
+TEST_F(TraceAnalyzerTest, Get) {
+  const std::string trace_path = test_path_ + "/trace";
+  const std::string output_path = test_path_ + "/get";
+  std::vector<std::string> paras = {
+      "-analyze_get", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // One entry per output file: its name under output_path, the expected
+  // lines, and whether lines must match exactly (true) or only in their first
+  // character (false).
+  struct ExpectedFile {
+    std::string name;
+    std::vector<std::string> lines;
+    bool full_content;
+  };
+  const std::vector<ExpectedFile> expected_files = {
+      // key stats
+      {"/test-get-0-accessed_key_stats.txt",
+       {"0 10 0 1 1.000000", "0 10 1 1 1.000000"},
+       true},
+      // access count distribution
+      {"/test-get-0-accessed_key_count_distribution.txt",
+       {"access_count: 1 num: 2"},
+       true},
+      // trace sequence
+      {"/test-human_readable_trace.txt",
+       {"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+       false},
+      // prefix
+      {"/test-get-0-accessed_key_prefix_cut.txt",
+       {"0 0 0 0.000000 0.000000 0x30", "1 1 1 1.000000 1.000000 0x61"},
+       true},
+      // time series
+      {"/test-get-0-time_series.txt",
+       {"0 1533000630 0", "0 1533000630 1"},
+       false},
+      // accessed key in whole key space
+      {"/test-get-0-whole_key_stats.txt", {"0 1"}, true},
+      // whole key prefix cut
+      {"/test-get-0-whole_key_prefix_cut.txt",
+       {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+       true},
+      // overall qps
+      {"/test-qps_stats.txt", {"1 0 0 0 0 0 0 0 1"}, true},
+      // qps of get
+      {"/test-get-0-qps_stats.txt", {"1"}, true},
+      // top k qps prefix cut
+      {"/test-get-0-accessed_top_k_qps_prefix_cut.txt",
+       {"At time: 0 with QPS: 1", "The prefix: 0x61 Access count: 1"},
+       true},
+  };
+  for (const auto& expected : expected_files) {
+    CheckFileContent(expected.lines, output_path + expected.name,
+                     expected.full_content);
+  }
+}
+
+// Test analyzing of Put: run the analyzer with -analyze_put and compare every
+// output file it writes with the expected lines.
+// NOTE(review): the expected overall QPS line also counts a Get, so the
+// analyzer flags presumably persist from earlier test cases in the same
+// process -- confirm before reordering or isolating tests.
+TEST_F(TraceAnalyzerTest, Put) {
+  const std::string trace_path = test_path_ + "/trace";
+  const std::string output_path = test_path_ + "/put";
+  std::vector<std::string> paras = {
+      "-analyze_put", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // One entry per output file: its name under output_path, the expected
+  // lines, and whether lines must match exactly (true) or only in their first
+  // character (false).
+  struct ExpectedFile {
+    std::string name;
+    std::vector<std::string> lines;
+    bool full_content;
+  };
+  const std::vector<ExpectedFile> expected_files = {
+      // key stats
+      {"/test-put-0-accessed_key_stats.txt", {"0 9 0 1 1.000000"}, true},
+      // access count distribution
+      {"/test-put-0-accessed_key_count_distribution.txt",
+       {"access_count: 1 num: 1"},
+       true},
+      // trace sequence
+      {"/test-human_readable_trace.txt",
+       {"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+       false},
+      // prefix
+      {"/test-put-0-accessed_key_prefix_cut.txt",
+       {"0 0 0 0.000000 0.000000 0x30"},
+       true},
+      // time series
+      {"/test-put-0-time_series.txt", {"1 1533056278 0"}, false},
+      // accessed key in whole key space
+      {"/test-put-0-whole_key_stats.txt", {"0 1"}, true},
+      // whole key prefix cut
+      {"/test-put-0-whole_key_prefix_cut.txt",
+       {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+       true},
+      // overall qps
+      {"/test-qps_stats.txt", {"1 1 0 0 0 0 0 0 2"}, true},
+      // qps of Put
+      {"/test-put-0-qps_stats.txt", {"1"}, true},
+      // top k qps prefix cut
+      {"/test-put-0-accessed_top_k_qps_prefix_cut.txt",
+       {"At time: 0 with QPS: 1", "The prefix: 0x61 Access count: 1"},
+       true},
+      // value size distribution
+      {"/test-put-0-accessed_value_size_distribution.txt",
+       {"Number_of_value_size_between 0 and 16 is: 1"},
+       true},
+  };
+  for (const auto& expected : expected_files) {
+    CheckFileContent(expected.lines, output_path + expected.name,
+                     expected.full_content);
+  }
+}
+
+// Test analyzing of Delete: run the analyzer with -analyze_delete and compare
+// every output file it writes with the expected lines.
+// NOTE(review): the expected overall QPS line also counts Get and Put, so the
+// analyzer flags presumably persist from earlier test cases in the same
+// process -- confirm before reordering or isolating tests.
+TEST_F(TraceAnalyzerTest, Delete) {
+  const std::string trace_path = test_path_ + "/trace";
+  const std::string output_path = test_path_ + "/delete";
+  std::vector<std::string> paras = {
+      "-analyze_delete", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // One entry per output file: its name under output_path, the expected
+  // lines, and whether lines must match exactly (true) or only in their first
+  // character (false).
+  struct ExpectedFile {
+    std::string name;
+    std::vector<std::string> lines;
+    bool full_content;
+  };
+  const std::vector<ExpectedFile> expected_files = {
+      // key stats
+      {"/test-delete-0-accessed_key_stats.txt", {"0 0 0 1 1.000000"}, true},
+      // access count distribution
+      {"/test-delete-0-accessed_key_count_distribution.txt",
+       {"access_count: 1 num: 1"},
+       true},
+      // trace sequence
+      {"/test-human_readable_trace.txt",
+       {"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+       false},
+      // prefix
+      {"/test-delete-0-accessed_key_prefix_cut.txt",
+       {"0 0 0 0.000000 0.000000 0x30"},
+       true},
+      // time series
+      {"/test-delete-0-time_series.txt", {"2 1533000630 0"}, false},
+      // accessed key in whole key space
+      {"/test-delete-0-whole_key_stats.txt", {"2 1"}, true},
+      // whole key prefix cut
+      {"/test-delete-0-whole_key_prefix_cut.txt",
+       {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+       true},
+      // overall qps
+      {"/test-qps_stats.txt", {"1 1 1 0 0 0 0 0 3"}, true},
+      // qps of Delete
+      {"/test-delete-0-qps_stats.txt", {"1"}, true},
+      // top k qps prefix cut
+      {"/test-delete-0-accessed_top_k_qps_prefix_cut.txt",
+       {"At time: 0 with QPS: 1", "The prefix: 0x63 Access count: 1"},
+       true},
+  };
+  for (const auto& expected : expected_files) {
+    CheckFileContent(expected.lines, output_path + expected.name,
+                     expected.full_content);
+  }
+}
+
+// Test analyzing of Merge: run the analyzer with -analyze_merge and compare
+// every output file it writes with the expected lines.
+// NOTE(review): the expected overall QPS line also counts Get, Put and
+// Delete, so the analyzer flags presumably persist from earlier test cases in
+// the same process -- confirm before reordering or isolating tests.
+TEST_F(TraceAnalyzerTest, Merge) {
+  const std::string trace_path = test_path_ + "/trace";
+  const std::string output_path = test_path_ + "/merge";
+  std::vector<std::string> paras = {
+      "-analyze_merge", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // One entry per output file: its name under output_path, the expected
+  // lines, and whether lines must match exactly (true) or only in their first
+  // character (false).
+  struct ExpectedFile {
+    std::string name;
+    std::vector<std::string> lines;
+    bool full_content;
+  };
+  const std::vector<ExpectedFile> expected_files = {
+      // key stats
+      {"/test-merge-0-accessed_key_stats.txt", {"0 20 0 1 1.000000"}, true},
+      // access count distribution
+      {"/test-merge-0-accessed_key_count_distribution.txt",
+       {"access_count: 1 num: 1"},
+       true},
+      // trace sequence
+      {"/test-human_readable_trace.txt",
+       {"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+       false},
+      // prefix
+      {"/test-merge-0-accessed_key_prefix_cut.txt",
+       {"0 0 0 0.000000 0.000000 0x30"},
+       true},
+      // time series
+      {"/test-merge-0-time_series.txt", {"5 1533000630 0"}, false},
+      // accessed key in whole key space
+      {"/test-merge-0-whole_key_stats.txt", {"1 1"}, true},
+      // whole key prefix cut
+      {"/test-merge-0-whole_key_prefix_cut.txt",
+       {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+       true},
+      // overall qps
+      {"/test-qps_stats.txt", {"1 1 1 0 0 1 0 0 4"}, true},
+      // qps of Merge
+      {"/test-merge-0-qps_stats.txt", {"1"}, true},
+      // top k qps prefix cut
+      {"/test-merge-0-accessed_top_k_qps_prefix_cut.txt",
+       {"At time: 0 with QPS: 1", "The prefix: 0x62 Access count: 1"},
+       true},
+      // value size distribution
+      {"/test-merge-0-accessed_value_size_distribution.txt",
+       {"Number_of_value_size_between 0 and 24 is: 1"},
+       true},
+  };
+  for (const auto& expected : expected_files) {
+    CheckFileContent(expected.lines, output_path + expected.name,
+                     expected.full_content);
+  }
+}
+
+// Test analyzing of SingleDelete: run the analyzer with
+// -analyze_single_delete and compare every output file it writes with the
+// expected lines.
+// NOTE(review): the expected overall QPS line also counts Get, Put, Delete
+// and Merge, so the analyzer flags presumably persist from earlier test cases
+// in the same process -- confirm before reordering or isolating tests.
+TEST_F(TraceAnalyzerTest, SingleDelete) {
+  const std::string trace_path = test_path_ + "/trace";
+  const std::string output_path = test_path_ + "/single_delete";
+  std::vector<std::string> paras = {
+      "-analyze_single_delete", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // One entry per output file: its name under output_path, the expected
+  // lines, and whether lines must match exactly (true) or only in their first
+  // character (false).
+  struct ExpectedFile {
+    std::string name;
+    std::vector<std::string> lines;
+    bool full_content;
+  };
+  const std::vector<ExpectedFile> expected_files = {
+      // key stats
+      {"/test-single_delete-0-accessed_key_stats.txt",
+       {"0 0 0 1 1.000000"},
+       true},
+      // access count distribution
+      {"/test-single_delete-0-accessed_key_count_distribution.txt",
+       {"access_count: 1 num: 1"},
+       true},
+      // trace sequence
+      {"/test-human_readable_trace.txt",
+       {"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+       false},
+      // prefix
+      {"/test-single_delete-0-accessed_key_prefix_cut.txt",
+       {"0 0 0 0.000000 0.000000 0x30"},
+       true},
+      // time series
+      {"/test-single_delete-0-time_series.txt", {"3 1533000630 0"}, false},
+      // accessed key in whole key space
+      {"/test-single_delete-0-whole_key_stats.txt", {"3 1"}, true},
+      // whole key prefix cut
+      {"/test-single_delete-0-whole_key_prefix_cut.txt",
+       {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+       true},
+      // overall qps
+      {"/test-qps_stats.txt", {"1 1 1 1 0 1 0 0 5"}, true},
+      // qps of SingleDelete
+      {"/test-single_delete-0-qps_stats.txt", {"1"}, true},
+      // top k qps prefix cut
+      {"/test-single_delete-0-accessed_top_k_qps_prefix_cut.txt",
+       {"At time: 0 with QPS: 1", "The prefix: 0x64 Access count: 1"},
+       true},
+  };
+  for (const auto& expected : expected_files) {
+    CheckFileContent(expected.lines, output_path + expected.name,
+                     expected.full_content);
+  }
+}
+
+// Test analyzing of DeleteRange
+TEST_F(TraceAnalyzerTest, DeleteRange) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/range_delete";
+  std::vector<std::string> paras = {
+      "-analyze_range_delete", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // Checks one output file, named relative to output_path, against the
+  // expected lines. The flag is forwarded to CheckFileContent unchanged.
+  auto expect_file = [&](std::vector<std::string> expected,
+                         const std::string& name, bool full_content) {
+    std::string file_path = output_path + name;
+    CheckFileContent(expected, file_path, full_content);
+  };
+
+  // Key stats of the accessed keys
+  expect_file({"0 0 0 1 1.000000", "0 0 1 1 1.000000"},
+              "/test-range_delete-0-accessed_key_stats.txt", true);
+
+  // Access count distribution
+  expect_file({"access_count: 1 num: 2"},
+              "/test-range_delete-0-accessed_key_count_distribution.txt", true);
+
+  // Trace sequence
+  expect_file({"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+              "/test-human_readable_trace.txt", false);
+
+  // Prefix cut of the accessed keys
+  expect_file(
+      {"0 0 0 0.000000 0.000000 0x30", "1 1 1 1.000000 1.000000 0x65"},
+      "/test-range_delete-0-accessed_key_prefix_cut.txt", true);
+
+  // Time series
+  expect_file({"4 1533000630 0", "4 1533060100 1"},
+              "/test-range_delete-0-time_series.txt", false);
+
+  // Accessed keys in the whole key space
+  expect_file({"4 1", "5 1"}, "/test-range_delete-0-whole_key_stats.txt",
+              true);
+
+  // Prefix cut of the whole key space
+  expect_file({"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+              "/test-range_delete-0-whole_key_prefix_cut.txt", true);
+
+  // Overall qps
+  expect_file({"1 1 1 1 2 1 0 0 7"}, "/test-qps_stats.txt", true);
+
+  // Qps of DeleteRange
+  expect_file({"2"}, "/test-range_delete-0-qps_stats.txt", true);
+
+  // Top k qps prefix cut
+  expect_file({"At time: 0 with QPS: 2", "The prefix: 0x65 Access count: 1",
+               "The prefix: 0x66 Access count: 1"},
+              "/test-range_delete-0-accessed_top_k_qps_prefix_cut.txt", true);
+}
+
+// Test analyzing of Iterator
+TEST_F(TraceAnalyzerTest, Iterator) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/iterator";
+  std::vector<std::string> paras = {
+      "-analyze_iterator", "-output_dir=" + output_path,
+      "-trace_path=" + trace_path, "-key_space_dir=" + test_path_};
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // Checks one output file, named relative to output_path, against the
+  // expected lines. The flag is forwarded to CheckFileContent unchanged.
+  auto expect_file = [&](std::vector<std::string> expected,
+                         const std::string& name, bool full_content) {
+    std::string file_path = output_path + name;
+    CheckFileContent(expected, file_path, full_content);
+  };
+
+  // ---- Output of Seek ----
+  // Key stats of the accessed keys
+  expect_file({"0 0 0 1 1.000000"},
+              "/test-iterator_Seek-0-accessed_key_stats.txt", true);
+
+  // Access count distribution
+  expect_file({"access_count: 1 num: 1"},
+              "/test-iterator_Seek-0-accessed_key_count_distribution.txt",
+              true);
+
+  // Trace sequence
+  expect_file({"1", "5", "2", "3", "4", "0", "6", "7", "0"},
+              "/test-human_readable_trace.txt", false);
+
+  // Prefix cut of the accessed keys
+  expect_file({"0 0 0 0.000000 0.000000 0x30"},
+              "/test-iterator_Seek-0-accessed_key_prefix_cut.txt", true);
+
+  // Time series
+  expect_file({"6 1 0"}, "/test-iterator_Seek-0-time_series.txt", false);
+
+  // Accessed key in the whole key space
+  expect_file({"0 1"}, "/test-iterator_Seek-0-whole_key_stats.txt", true);
+
+  // Prefix cut of the whole key space
+  expect_file({"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+              "/test-iterator_Seek-0-whole_key_prefix_cut.txt", true);
+
+  // Overall qps
+  expect_file({"1 1 1 1 2 1 1 1 9"}, "/test-qps_stats.txt", true);
+
+  // Qps of Iterator_Seek
+  expect_file({"1"}, "/test-iterator_Seek-0-qps_stats.txt", true);
+
+  // Top k qps prefix cut
+  expect_file({"At time: 0 with QPS: 1", "The prefix: 0x61 Access count: 1"},
+              "/test-iterator_Seek-0-accessed_top_k_qps_prefix_cut.txt", true);
+
+  // ---- Output of SeekForPrev ----
+  // Key stats of the accessed keys
+  expect_file({"0 0 0 1 1.000000"},
+              "/test-iterator_SeekForPrev-0-accessed_key_stats.txt", true);
+
+  // Access count distribution
+  expect_file(
+      {"access_count: 1 num: 1"},
+      "/test-iterator_SeekForPrev-0-accessed_key_count_distribution.txt",
+      true);
+
+  // Prefix cut of the accessed keys
+  expect_file({"0 0 0 0.000000 0.000000 0x30"},
+              "/test-iterator_SeekForPrev-0-accessed_key_prefix_cut.txt", true);
+
+  // Time series
+  expect_file({"7 0 0"}, "/test-iterator_SeekForPrev-0-time_series.txt",
+              false);
+
+  // Accessed key in the whole key space
+  expect_file({"1 1"}, "/test-iterator_SeekForPrev-0-whole_key_stats.txt",
+              true);
+
+  // Prefix cut of the whole key space
+  expect_file({"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"},
+              "/test-iterator_SeekForPrev-0-whole_key_prefix_cut.txt", true);
+
+  // Qps of Iterator_SeekForPrev
+  expect_file({"1"}, "/test-iterator_SeekForPrev-0-qps_stats.txt", true);
+
+  // Top k qps prefix cut
+  expect_file(
+      {"At time: 0 with QPS: 1", "The prefix: 0x62 Access count: 1"},
+      "/test-iterator_SeekForPrev-0-accessed_top_k_qps_prefix_cut.txt", true);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: hands the command line to googletest and returns the
+// result of running every registered test.
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#endif // GFLAG
+#else
+#include <stdio.h>
+
+// Stub entry point for this preprocessor branch: prints a notice to stderr
+// and exits with 0 without running any test.
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "Trace_analyzer test is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/tools/trace_analyzer_tool.cc b/src/rocksdb/tools/trace_analyzer_tool.cc
new file mode 100644
index 000000000..e75845c29
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_tool.cc
@@ -0,0 +1,2001 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#ifdef GFLAGS
+#ifdef NUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "rocksdb/write_batch.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "tools/trace_analyzer_tool.h"
+#include "trace_replay/trace_replay.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+// Input trace location, output directory, and the file-name prefix that is
+// prepended to every generated output file.
+DEFINE_string(trace_path, "", "The trace file path.");
+DEFINE_string(output_dir, "", "The directory to store the output files.");
+DEFINE_string(output_prefix, "trace",
+ "The prefix used for all the output files.");
+// The accessed-key line is written by MakeStatisticKeyStatsOrPrefix with five
+// fields (cf_id, value_size, key_id, access_count, succ_ratio); the help text
+// below lists all of them.
+DEFINE_bool(output_key_stats, false,
+            "Output the key access count statistics to file\n"
+            "for accessed keys:\n"
+            "file name: <prefix>-<query_type>-<cf_id>-accessed_key_stats.txt\n"
+            "Format:[cf_id value_size access_keyid access_count succ_ratio]\n"
+            "for the whole key space keys:\n"
+            "File name: <prefix>-<query_type>-<cf_id>-whole_key_stats.txt\n"
+            "Format:[whole_key_space_keyid access_count]");
+// Per-query-type output switches. Each help string names the file that is
+// produced and the layout of one line in it.
+DEFINE_bool(output_access_count_stats, false,
+ "Output the access count distribution statistics to file.\n"
+ "File name: <prefix>-<query_type>-<cf_id>-accessed_"
+ "key_count_distribution.txt \n"
+ "Format:[access_count number_of_access_count]");
+DEFINE_bool(output_time_series, false,
+ "Output the access time in second of each key, "
+ "such that we can have the time series data of the queries \n"
+ "File name: <prefix>-<query_type>-<cf_id>-time_series.txt\n"
+ "Format:[type_id time_in_sec access_keyid].");
+// Controls how a corrupted trace file is handled (see the help text).
+DEFINE_bool(try_process_corrupted_trace, false,
+ "In default, trace_analyzer will exit if the trace file is "
+ "corrupted due to the unexpected tracing cases. If this option "
+ "is enabled, trace_analyzer will stop reading the trace file, "
+ "and start analyzing the read-in data.");
+// A value > 0 enables the prefix-cut outputs; the value is the prefix length
+// in bytes.
+DEFINE_int32(output_prefix_cut, 0,
+ "The number of bytes as prefix to cut the keys.\n"
+ "If it is enabled, it will generate the following:\n"
+ "For accessed keys:\n"
+ "File name: <prefix>-<query_type>-<cf_id>-"
+ "accessed_key_prefix_cut.txt \n"
+ "Format:[acessed_keyid access_count_of_prefix "
+ "number_of_keys_in_prefix average_key_access "
+ "prefix_succ_ratio prefix]\n"
+ "For whole key space keys:\n"
+ "File name: <prefix>-<query_type>-<cf_id>"
+ "-whole_key_prefix_cut.txt\n"
+ "Format:[start_keyid_in_whole_keyspace prefix]\n"
+ "if 'output_qps_stats' and 'top_k' are enabled, it will output:\n"
+ "File name: <prefix>-<query_type>-<cf_id>"
+ "-accessed_top_k_qps_prefix_cut.txt\n"
+ "Format:[the_top_ith_qps_time QPS], [prefix qps_of_this_second].");
+// The output file is created in PrepareProcessing as
+// <output_dir>/<prefix>-human_readable_trace.txt; the help text below uses
+// that same name.
+DEFINE_bool(convert_to_human_readable_trace, false,
+            "Convert the binary trace file to a human readable txt file "
+            "for further processing. "
+            "This file will be extremely large "
+            "(similar size as the original binary trace file). "
+            "You can specify 'no_key' to reduce the size, if key is not "
+            "needed in the next step.\n"
+            "File name: <prefix>-human_readable_trace.txt\n"
+            "Format:[type_id cf_id value_size time_in_microsec <key>].");
+// The overall QPS file is created in PrepareProcessing as
+// <output_dir>/<prefix>-qps_stats.txt; the help text below uses the same
+// '-' separated names that are actually written.
+DEFINE_bool(output_qps_stats, false,
+            "Output the query per second(qps) statistics \n"
+            "For the overall qps, it will contain all qps of each query type. "
+            "The time is started from the first trace record\n"
+            "File name: <prefix>-qps_stats.txt\n"
+            "Format: [qps_type_1 qps_type_2 ...... overall_qps]\n"
+            "For each cf and query, it will have its own qps output.\n"
+            "File name: <prefix>-<query_type>-<cf_id>-qps_stats.txt \n"
+            "Format:[query_count_in_this_second].");
+DEFINE_bool(no_print, false, "Do not print out any result");
+// Parsed by AnalyzerOptions::SparseCorrelationInput: a sequence of
+// "[op1,op2]" groups whose operation names must be keys of taOptToIndex.
+DEFINE_string(
+    print_correlation, "",
+    "input format: [correlation pairs][.,.]\n"
+    "Output the query correlations between the pairs of query types "
+    "listed in the parameter, input should select the operations from:\n"
+    "get, put, delete, single_delete, range_delete, merge, iterator_Seek, "
+    "iterator_SeekForPrev. No space "
+    "between the pairs separated by comma. Example: =[get,get]... "
+    "It will print out the number of pairs of 'A after B' and "
+    "the average time interval between the two query.");
+// Directory holding one "<column family id>.txt" key space file per column
+// family (see the help text).
+DEFINE_string(key_space_dir, "",
+ "<the directory stores full key space files> \n"
+ "The key space files should be: <column family id>.txt");
+// Switches selecting which query types are analyzed. analyze_iterator
+// enables both iterator_Seek and iterator_SeekForPrev in the TraceAnalyzer
+// constructor.
+DEFINE_bool(analyze_get, false, "Analyze the Get query.");
+DEFINE_bool(analyze_put, false, "Analyze the Put query.");
+DEFINE_bool(analyze_delete, false, "Analyze the Delete query.");
+DEFINE_bool(analyze_single_delete, false, "Analyze the SingleDelete query.");
+DEFINE_bool(analyze_range_delete, false, "Analyze the DeleteRange query.");
+DEFINE_bool(analyze_merge, false, "Analyze the Merge query.");
+DEFINE_bool(analyze_iterator, false,
+ " Analyze the iterate query like seek() and seekForPrev().");
+DEFINE_bool(no_key, false,
+ " Does not output the key to the result files to make smaller.");
+DEFINE_bool(print_overall_stats, true,
+ " Print the stats of the whole trace, "
+ "like total requests, keys, and etc.");
+DEFINE_bool(output_key_distribution, false, "Print the key size distribution.");
+DEFINE_bool(
+ output_value_distribution, false,
+ "Out put the value size distribution, only available for Put and Merge.\n"
+ "File name: <prefix>-<query_type>-<cf_id>"
+ "-accessed_value_size_distribution.txt\n"
+ "Format:[Number_of_value_size_between x and "
+ "x+value_interval is: <the count>]");
+// Tuning knobs for the amount and granularity of the output.
+DEFINE_int32(print_top_k_access, 1,
+ "<top K of the variables to be printed> "
+ "Print the top k accessed keys, top k accessed prefix "
+ "and etc.");
+DEFINE_int32(output_ignore_count, 0,
+ "<threshold>, ignores the access count <= this value, "
+ "it will shorter the output.");
+DEFINE_int32(value_interval, 8,
+ "To output the value distribution, we need to set the value "
+ "intervals and make the statistic of the value size distribution "
+ "in different intervals. The default is 8.");
+// Values outside (0, 1.0] are treated as "no sampling" by the TraceAnalyzer
+// constructor (sample_max_ is set to 1).
+DEFINE_double(sample_ratio, 1.0,
+ "If the trace size is extremely huge or user want to sample "
+ "the trace when analyzing, sample ratio can be set (0, 1.0]");
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps an operation name to its numeric index. The indices match the ta_
+// slots that the TraceAnalyzer constructor fills in with the same names.
+std::map<std::string, int> taOptToIndex = {
+ {"get", 0}, {"put", 1},
+ {"delete", 2}, {"single_delete", 3},
+ {"range_delete", 4}, {"merge", 5},
+ {"iterator_Seek", 6}, {"iterator_SeekForPrev", 7}};
+
+// Inverse of taOptToIndex: maps a numeric index back to the operation name.
+std::map<int, std::string> taIndexToOpt = {
+ {0, "get"}, {1, "put"},
+ {2, "delete"}, {3, "single_delete"},
+ {4, "range_delete"}, {5, "merge"},
+ {6, "iterator_Seek"}, {7, "iterator_SeekForPrev"}};
+
+namespace {
+
+// Multiplies op1 by op2 with an overflow guard.
+// Returns 0 if either operand is 0, op1 unchanged if the product would not
+// fit in a uint64_t, and op1 * op2 otherwise.
+uint64_t MultiplyCheckOverflow(uint64_t op1, uint64_t op2) {
+  const bool has_zero_operand = (op1 == 0) || (op2 == 0);
+  if (has_zero_operand) {
+    return 0;
+  }
+  // Largest multiplier that still keeps op1 * multiplier within range.
+  const uint64_t largest_safe_multiplier = port::kMaxUint64 / op1;
+  return (op2 > largest_safe_multiplier) ? op1 : (op1 * op2);
+}
+
+// Decodes a trace payload laid out as a fixed 32-bit column family id
+// followed by a length-prefixed key. The results are written to *cf_id and
+// *key.
+// NOTE(review): whatever GetFixed32 / GetLengthPrefixedSlice report about a
+// malformed buffer is discarded here, so the outputs may be left as the
+// caller initialized them — confirm callers tolerate that.
+void DecodeCFAndKeyFromString(std::string& buffer, uint32_t* cf_id, Slice* key) {
+ Slice buf(buffer);
+ GetFixed32(&buf, cf_id);
+ GetLengthPrefixedSlice(&buf, key);
+}
+
+} // namespace
+
+// The default constructor of AnalyzerOptions
+// correlation_map starts as a kTaTypeNum x kTaTypeNum matrix filled with -1;
+// SparseCorrelationInput later stores a sequence number for each requested
+// pair.
+AnalyzerOptions::AnalyzerOptions()
+ : correlation_map(kTaTypeNum, std::vector<int>(kTaTypeNum, -1)) {}
+
+AnalyzerOptions::~AnalyzerOptions() {}
+
+// Parses a correlation specification made of consecutive "[op1,op2]" groups.
+// Every recognized pair is appended to correlation_list, and afterwards each
+// list entry's position is recorded in correlation_map[op1][op2].
+// An empty input is a no-op. Any malformed group or unknown operation name
+// prints an error to stderr and terminates the process with exit(1).
+void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) {
+  if (in_str.empty()) {
+    return;
+  }
+
+  // Single place for the (fatal) error path shared by all validations.
+  auto reject_input = [&in_str]() {
+    fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str());
+    exit(1);
+  };
+
+  std::string remaining = in_str;
+  while (!remaining.empty()) {
+    const std::size_t comma_pos = remaining.find_first_of(",");
+    const std::size_t close_pos = remaining.find_first_of("]");
+    if (remaining.front() != '[' || comma_pos == std::string::npos ||
+        close_pos == std::string::npos) {
+      reject_input();
+    }
+
+    // Text between '[' and ',' and between ',' and ']'.
+    const std::string first_op = remaining.substr(1, comma_pos - 1);
+    const std::string second_op =
+        remaining.substr(comma_pos + 1, close_pos - comma_pos - 1);
+    remaining = remaining.substr(close_pos + 1);
+
+    const auto first_it = taOptToIndex.find(first_op);
+    const auto second_it = taOptToIndex.find(second_op);
+    if (first_it == taOptToIndex.end() || second_it == taOptToIndex.end()) {
+      reject_input();
+    }
+    correlation_list.push_back(
+        std::make_pair(first_it->second, second_it->second));
+  }
+
+  // Number the pairs in list order.
+  int next_sequence = 0;
+  for (auto& type_pair : correlation_list) {
+    correlation_map[type_pair.first][type_pair.second] = next_sequence;
+    ++next_sequence;
+  }
+}
+
+// The trace statistic struct constructor
+// Starts every numeric field at zero and names the column family "0".
+// Assignments are kept in the body (instead of an initializer list) so the
+// result does not depend on the member declaration order.
+TraceStats::TraceStats() {
+ cf_id = 0;
+ cf_name = "0";
+ a_count = 0;
+ a_key_id = 0;
+ a_key_size_sqsum = 0;
+ a_key_size_sum = 0;
+ a_key_mid = 0;
+ a_value_size_sqsum = 0;
+ a_value_size_sum = 0;
+ a_value_mid = 0;
+ a_peak_qps = 0;
+ a_ave_qps = 0.0;
+}
+
+TraceStats::~TraceStats() {}
+
+// The trace analyzer constructor
+// Builds an analyzer for the trace at `trace_path` that writes its results
+// under `output_path`.
+//
+// All counters and timestamps start at zero. sample_max_ is derived from
+// FLAGS_sample_ratio, and one ta_ slot per operation type is set up with its
+// name and its enabled state taken from the matching FLAGS_analyze_* switch
+// (both iterator types share FLAGS_analyze_iterator).
+TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path,
+                             AnalyzerOptions _analyzer_opts)
+    : trace_name_(trace_path),
+      output_path_(output_path),
+      analyzer_opts_(_analyzer_opts) {
+  env_ = ROCKSDB_NAMESPACE::Env::Default();
+  offset_ = 0;
+  c_time_ = 0;
+  total_requests_ = 0;
+  total_access_keys_ = 0;
+  total_gets_ = 0;
+  total_writes_ = 0;
+  trace_create_time_ = 0;
+  begin_time_ = 0;
+  end_time_ = 0;
+  time_series_start_ = 0;
+  cur_time_sec_ = 0;
+
+  // sample_max_ is the truncated reciprocal of the sample ratio. The range
+  // test is written positively so that NaN (which fails every comparison)
+  // also falls back to 1, and the reciprocal is clamped so that a very small
+  // ratio cannot make the double -> uint32_t conversion go out of range.
+  if (FLAGS_sample_ratio > 0 && FLAGS_sample_ratio <= 1.0) {
+    const double inverse_ratio = 1.0 / FLAGS_sample_ratio;
+    if (inverse_ratio >= static_cast<double>(UINT32_MAX)) {
+      sample_max_ = UINT32_MAX;
+    } else {
+      sample_max_ = static_cast<uint32_t>(inverse_ratio);
+    }
+  } else {
+    sample_max_ = 1;
+  }
+
+  // Name and enabled state per operation type; the position in this table is
+  // the ta_ index and matches taOptToIndex / taIndexToOpt.
+  const struct {
+    const char* type_name;
+    bool enabled;
+  } type_setup[] = {
+      {"get", FLAGS_analyze_get},
+      {"put", FLAGS_analyze_put},
+      {"delete", FLAGS_analyze_delete},
+      {"single_delete", FLAGS_analyze_single_delete},
+      {"range_delete", FLAGS_analyze_range_delete},
+      {"merge", FLAGS_analyze_merge},
+      {"iterator_Seek", FLAGS_analyze_iterator},
+      {"iterator_SeekForPrev", FLAGS_analyze_iterator},
+  };
+
+  ta_.resize(kTaTypeNum);
+  for (size_t i = 0; i < sizeof(type_setup) / sizeof(type_setup[0]); i++) {
+    ta_[i].type_name = type_setup[i].type_name;
+    ta_[i].enabled = type_setup[i].enabled;
+  }
+  for (int i = 0; i < kTaTypeNum; i++) {
+    ta_[i].sample_count = 0;
+  }
+}
+
+TraceAnalyzer::~TraceAnalyzer() {}
+
+// Prepare the processing
+// Initiate the global trace reader and writer here
+// Opens the trace reader and, depending on the flags, the writers for the
+// human readable trace and the overall / per-column-family QPS files.
+// Returns the first non-OK status encountered, or OK if everything opened.
+Status TraceAnalyzer::PrepareProcessing() {
+  // Trace reader
+  Status s =
+      NewFileTraceReader(env_, env_options_, trace_name_, &trace_reader_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Common leading part of every file opened below.
+  const std::string file_prefix = output_path_ + "/" + FLAGS_output_prefix;
+
+  // Trace sequence (human readable) writer, only if requested
+  if (FLAGS_convert_to_human_readable_trace) {
+    s = env_->NewWritableFile(file_prefix + "-human_readable_trace.txt",
+                              &trace_sequence_f_, env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // General QPS writers, only if requested
+  if (FLAGS_output_qps_stats) {
+    s = env_->NewWritableFile(file_prefix + "-qps_stats.txt", &qps_f_,
+                              env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->NewWritableFile(file_prefix + "-cf_qps_stats.txt", &cf_qps_f_,
+                              env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+// Reads the next record into *header and validates that it is a trace
+// header: its type must be kTraceBegin and its payload must start with
+// kTraceMagic. Returns the read error, Corruption on a failed validation,
+// or OK.
+Status TraceAnalyzer::ReadTraceHeader(Trace* header) {
+  assert(header != nullptr);
+  Status read_status = ReadTraceRecord(header);
+  if (read_status.ok()) {
+    if (header->type != kTraceBegin) {
+      return Status::Corruption("Corrupted trace file. Incorrect header.");
+    }
+    const bool magic_matches =
+        header->payload.compare(0, kTraceMagic.length(), kTraceMagic) == 0;
+    if (!magic_matches) {
+      return Status::Corruption("Corrupted trace file. Incorrect magic.");
+    }
+  }
+  return read_status;
+}
+
+// Reads the next record into *footer and validates that its type is
+// kTraceEnd. Returns the read error, Corruption on a type mismatch, or OK.
+Status TraceAnalyzer::ReadTraceFooter(Trace* footer) {
+  assert(footer != nullptr);
+  Status read_status = ReadTraceRecord(footer);
+  if (read_status.ok() && footer->type != kTraceEnd) {
+    return Status::Corruption("Corrupted trace file. Incorrect footer.");
+  }
+  return read_status;
+}
+
+// Reads one encoded record from the trace reader and decodes it into *trace.
+//
+// The encoded record is: a fixed 64-bit timestamp, kTraceTypeSize bytes of
+// type, kTracePayloadLengthSize bytes of payload length, then the payload.
+// Returns the reader's status if the read fails, Corruption if the record is
+// too short to hold the timestamp or the type/length fields (so that a
+// truncated record is never indexed out of bounds), and OK otherwise.
+Status TraceAnalyzer::ReadTraceRecord(Trace* trace) {
+  assert(trace != nullptr);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Slice enc_slice = Slice(encoded_trace);
+  if (!GetFixed64(&enc_slice, &trace->ts)) {
+    return Status::Corruption("Corrupted trace file. Incomplete timestamp.");
+  }
+  // Both enc_slice[0] and remove_prefix() below require these bytes to exist.
+  if (enc_slice.size() < kTraceTypeSize + kTracePayloadLengthSize) {
+    return Status::Corruption(
+        "Corrupted trace file. Incomplete record header.");
+  }
+  trace->type = static_cast<TraceType>(enc_slice[0]);
+  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
+  trace->payload = enc_slice.ToString();
+  return s;
+}
+
+// Process the trace itself and redirect the trace content
+// to the handler of the corresponding operation type. If the trace
+// format changes, this function needs to be changed accordingly.
+//
+// Reads the header first, then loops over the records until a read fails or
+// a kTraceEnd record is seen. Returns OK when the loop ends with an
+// Incomplete status, otherwise the last status.
+Status TraceAnalyzer::StartProcessing() {
+ Status s;
+ Trace header;
+ s = ReadTraceHeader(&header);
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot read the header\n");
+ return s;
+ }
+ trace_create_time_ = header.ts;
+ if (FLAGS_output_time_series) {
+ time_series_start_ = header.ts;
+ }
+
+ Trace trace;
+ while (s.ok()) {
+ trace.reset();
+ s = ReadTraceRecord(&trace);
+ if (!s.ok()) {
+ break;
+ }
+
+ // Every successfully read record is counted, whatever its type.
+ total_requests_++;
+ end_time_ = trace.ts;
+ if (trace.type == kTraceWrite) {
+ total_writes_++;
+ c_time_ = trace.ts;
+ WriteBatch batch(trace.payload);
+
+ // Note that, if the write happens in a transaction,
+ // 'Write' will be called twice, one for Prepare, one for
+ // Commit. Thus, in the trace, for the same WriteBatch, there
+ // will be two records if it is in a transaction. Here, we only
+ // process the record that is committed. If write is non-transaction,
+ // HasBeginPrepare()==false, so we process it normally.
+ if (batch.HasBeginPrepare() && !batch.HasCommit()) {
+ continue;
+ }
+ TraceWriteHandler write_handler(this);
+ s = batch.Iterate(&write_handler);
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot process the write batch in the trace\n");
+ return s;
+ }
+ } else if (trace.type == kTraceGet) {
+ uint32_t cf_id = 0;
+ Slice key;
+ DecodeCFAndKeyFromString(trace.payload, &cf_id, &key);
+ total_gets_++;
+
+ s = HandleGet(cf_id, key.ToString(), trace.ts, 1);
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot process the get in the trace\n");
+ return s;
+ }
+ } else if (trace.type == kTraceIteratorSeek ||
+ trace.type == kTraceIteratorSeekForPrev) {
+ uint32_t cf_id = 0;
+ Slice key;
+ DecodeCFAndKeyFromString(trace.payload, &cf_id, &key);
+ s = HandleIter(cf_id, key.ToString(), trace.ts, trace.type);
+ if (!s.ok()) {
+ fprintf(stderr, "Cannot process the iterator in the trace\n");
+ return s;
+ }
+ } else if (trace.type == kTraceEnd) {
+ break;
+ }
+ }
+ if (s.IsIncomplete()) {
+ // Fix it: Reaching eof returns Incomplete status at the moment.
+ //
+ return Status::OK();
+ }
+ return s;
+}
+
+// After the trace is processed by StartProcessing, the statistic data
+// is stored in the map or other in memory data structures. To get the
+// other statistic result such as key size distribution, value size
+// distribution, these data structures are re-processed here.
+// For every enabled operation type and each of its stats entries: assigns
+// sequential key ids, builds the access-count and key-size distributions,
+// accumulates correlations, and writes the distribution files whose writers
+// are open. Finally produces the QPS statistics if requested.
+// Returns the first non-OK status, or OK.
+Status TraceAnalyzer::MakeStatistics() {
+ int ret;
+ Status s;
+ for (int type = 0; type < kTaTypeNum; type++) {
+ if (!ta_[type].enabled) {
+ continue;
+ }
+ for (auto& stat : ta_[type].stats) {
+ // Key ids are handed out in the iteration order of a_key_stats.
+ stat.second.a_key_id = 0;
+ for (auto& record : stat.second.a_key_stats) {
+ record.second.key_id = stat.second.a_key_id;
+ stat.second.a_key_id++;
+ // Keys at or below the ignore threshold still get an id, but are left
+ // out of the distributions and correlations below.
+ if (record.second.access_count <=
+ static_cast<uint64_t>(FLAGS_output_ignore_count)) {
+ continue;
+ }
+
+ // Generate the key access count distribution data
+ if (FLAGS_output_access_count_stats) {
+ if (stat.second.a_count_stats.find(record.second.access_count) ==
+ stat.second.a_count_stats.end()) {
+ stat.second.a_count_stats[record.second.access_count] = 1;
+ } else {
+ stat.second.a_count_stats[record.second.access_count]++;
+ }
+ }
+
+ // Generate the key size distribution data
+ if (FLAGS_output_key_distribution) {
+ if (stat.second.a_key_size_stats.find(record.first.size()) ==
+ stat.second.a_key_size_stats.end()) {
+ stat.second.a_key_size_stats[record.first.size()] = 1;
+ } else {
+ stat.second.a_key_size_stats[record.first.size()]++;
+ }
+ }
+
+ if (!FLAGS_print_correlation.empty()) {
+ s = MakeStatisticCorrelation(stat.second, record.second);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ // Output the prefix cut or the whole content of the accessed key space
+ if (FLAGS_output_key_stats || FLAGS_output_prefix_cut > 0) {
+ s = MakeStatisticKeyStatsOrPrefix(stat.second);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // output the access count distribution
+ if (FLAGS_output_access_count_stats && stat.second.a_count_dist_f) {
+ for (auto& record : stat.second.a_count_stats) {
+ ret = snprintf(buffer_, sizeof(buffer_),
+ "access_count: %" PRIu64 " num: %" PRIu64 "\n",
+ record.first, record.second);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_count_dist_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write access count distribution file failed\n");
+ return s;
+ }
+ }
+ }
+
+ // find the median of the key size
+ // NOTE(review): the threshold below is a_key_mid itself, whereas the
+ // value-size loop further down uses a_count / 2. TraceStats() sets
+ // a_key_mid to 0, so unless it is assigned elsewhere the first bucket is
+ // always picked — confirm the intended threshold.
+ uint64_t k_count = 0;
+ bool get_mid = false;
+ for (auto& record : stat.second.a_key_size_stats) {
+ k_count += record.second;
+ if (!get_mid && k_count >= stat.second.a_key_mid) {
+ stat.second.a_key_mid = record.first;
+ get_mid = true;
+ }
+ if (FLAGS_output_key_distribution && stat.second.a_key_size_f) {
+ ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n",
+ record.first, record.second);
+ if (ret < 0) {
+ return Status::IOError("Format output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_key_size_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write key size distribution file failed\n");
+ return s;
+ }
+ }
+ }
+
+ // output the value size distribution
+ // v_begin is the upper bound of the previously visited bucket (0 for the
+ // first one), so the reported range also spans any empty buckets that lie
+ // between two populated ones.
+ uint64_t v_begin = 0, v_end = 0, v_count = 0;
+ get_mid = false;
+ for (auto& record : stat.second.a_value_size_stats) {
+ v_begin = v_end;
+ v_end = (record.first + 1) * FLAGS_value_interval;
+ v_count += record.second;
+ if (!get_mid && v_count >= stat.second.a_count / 2) {
+ stat.second.a_value_mid = (v_begin + v_end) / 2;
+ get_mid = true;
+ }
+ // The value size file is only written for the Put and Merge types.
+ if (FLAGS_output_value_distribution && stat.second.a_value_size_f &&
+ (type == TraceOperationType::kPut ||
+ type == TraceOperationType::kMerge)) {
+ ret = snprintf(buffer_, sizeof(buffer_),
+ "Number_of_value_size_between %" PRIu64 " and %" PRIu64
+ " is: %" PRIu64 "\n",
+ v_begin, v_end, record.second);
+ if (ret < 0) {
+ return Status::IOError("Format output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_value_size_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write value size distribution file failed\n");
+ return s;
+ }
+ }
+ }
+ }
+ }
+
+ // Make the QPS statistics
+ if (FLAGS_output_qps_stats) {
+ s = MakeStatisticQPS();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+// Process the statistics of the key access and
+// prefix of the accessed keys if required
+// Writes one line per accessed key to stats.a_key_f and, when the prefix cut
+// is enabled and its writer is open, one line per key prefix to
+// stats.a_prefix_cut_f, while maintaining the top-k prefix heaps.
+// Also recomputes stats.a_succ_count as the sum of the per-key succ_count.
+// Returns IOError if a_key_f is not open or formatting fails, the Append
+// error if a write fails, and OK otherwise.
+Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) {
+ int ret;
+ Status s;
+ // Placeholder prefix used before the first key has been seen.
+ std::string prefix = "0";
+ uint64_t prefix_access = 0;
+ uint64_t prefix_count = 0;
+ uint64_t prefix_succ_access = 0;
+ double prefix_ave_access = 0.0;
+ stats.a_succ_count = 0;
+ for (auto& record : stats.a_key_stats) {
+ // write the key access statistic file
+ // NOTE(review): this check is made regardless of which flag caused the
+ // call, so a prefix-cut-only run also needs a_key_f to be open.
+ if (!stats.a_key_f) {
+ return Status::IOError("Failed to open accessed_key_stats file.");
+ }
+ stats.a_succ_count += record.second.succ_count;
+ double succ_ratio = 0.0;
+ if (record.second.access_count > 0) {
+ succ_ratio = (static_cast<double>(record.second.succ_count)) /
+ record.second.access_count;
+ }
+ ret = snprintf(buffer_, sizeof(buffer_),
+ "%u %zu %" PRIu64 " %" PRIu64 " %f\n", record.second.cf_id,
+ record.second.value_size, record.second.key_id,
+ record.second.access_count, succ_ratio);
+ if (ret < 0) {
+ return Status::IOError("Format output failed");
+ }
+ std::string printout(buffer_);
+ s = stats.a_key_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write key access file failed\n");
+ return s;
+ }
+
+ // write the prefix cut of the accessed keys
+ if (FLAGS_output_prefix_cut > 0 && stats.a_prefix_cut_f) {
+ // A key with a different prefix closes the current group: the totals
+ // accumulated so far are written out together with the id of the key
+ // that starts the next group. For the very first key this emits the
+ // placeholder prefix with zero counts.
+ // NOTE(review): nothing is written after the loop, so the totals of the
+ // last prefix group are accumulated but never output — confirm intent.
+ if (record.first.compare(0, FLAGS_output_prefix_cut, prefix) != 0) {
+ std::string prefix_out =
+ ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix);
+ if (prefix_count == 0) {
+ prefix_ave_access = 0.0;
+ } else {
+ prefix_ave_access =
+ (static_cast<double>(prefix_access)) / prefix_count;
+ }
+ double prefix_succ_ratio = 0.0;
+ if (prefix_access > 0) {
+ prefix_succ_ratio =
+ (static_cast<double>(prefix_succ_access)) / prefix_access;
+ }
+ ret =
+ snprintf(buffer_, sizeof(buffer_),
+ "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n",
+ record.second.key_id, prefix_access, prefix_count,
+ prefix_ave_access, prefix_succ_ratio, prefix_out.c_str());
+ if (ret < 0) {
+ return Status::IOError("Format output failed");
+ }
+ std::string pout(buffer_);
+ s = stats.a_prefix_cut_f->Append(pout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write accessed key prefix file failed\n");
+ return s;
+ }
+
+ // make the top k statistic for the prefix
+ // Fill the heap up to k entries; afterwards replace its top element
+ // only when the new value is larger than it.
+ if (static_cast<int32_t>(stats.top_k_prefix_access.size()) <
+ FLAGS_print_top_k_access) {
+ stats.top_k_prefix_access.push(
+ std::make_pair(prefix_access, prefix_out));
+ } else {
+ if (prefix_access > stats.top_k_prefix_access.top().first) {
+ stats.top_k_prefix_access.pop();
+ stats.top_k_prefix_access.push(
+ std::make_pair(prefix_access, prefix_out));
+ }
+ }
+
+ if (static_cast<int32_t>(stats.top_k_prefix_ave.size()) <
+ FLAGS_print_top_k_access) {
+ stats.top_k_prefix_ave.push(
+ std::make_pair(prefix_ave_access, prefix_out));
+ } else {
+ if (prefix_ave_access > stats.top_k_prefix_ave.top().first) {
+ stats.top_k_prefix_ave.pop();
+ stats.top_k_prefix_ave.push(
+ std::make_pair(prefix_ave_access, prefix_out));
+ }
+ }
+
+ // Start accumulating for the new prefix.
+ prefix = record.first.substr(0, FLAGS_output_prefix_cut);
+ prefix_access = 0;
+ prefix_count = 0;
+ prefix_succ_access = 0;
+ }
+ prefix_access += record.second.access_count;
+ prefix_count += 1;
+ prefix_succ_access += record.second.succ_count;
+ }
+ }
+ return Status::OK();
+}
+
+// Fold the per-pair correlation counters of one key (`unit`) into the
+// running totals kept in stats.correlation_output: `.first` accumulates the
+// occurrence count and `.second` the accumulated time difference.
+// Returns Corruption when stats.correlation_output does not have one slot
+// per entry of analyzer_opts_.correlation_list.
+Status TraceAnalyzer::MakeStatisticCorrelation(TraceStats& stats,
+                                               StatsUnit& unit) {
+  if (stats.correlation_output.size() !=
+      analyzer_opts_.correlation_list.size()) {
+    return Status::Corruption("Cannot make the statistic of correlation.");
+  }
+
+  // Walk only the index range that is valid for all three containers.
+  int limit = static_cast<int>(analyzer_opts_.correlation_list.size());
+  const int output_len = static_cast<int>(stats.correlation_output.size());
+  const int unit_len = static_cast<int>(unit.v_correlation.size());
+  if (output_len < limit) {
+    limit = output_len;
+  }
+  if (unit_len < limit) {
+    limit = unit_len;
+  }
+
+  for (int idx = 0; idx < limit; ++idx) {
+    auto& total = stats.correlation_output[idx];
+    const auto& sample = unit.v_correlation[idx];
+    total.first += sample.count;
+    total.second += sample.total_ts;
+  }
+  return Status::OK();
+}
+
+// Process the statistics of QPS: bucket the recorded per-second counts by type and CF, write the open QPS output files, and store peak/average QPS
+Status TraceAnalyzer::MakeStatisticQPS() {
+ if(begin_time_ == 0) {  // no begin time recorded: fall back to the trace creation time
+ begin_time_ = trace_create_time_;
+ }
+ uint32_t duration =  // length of the analyzed window after dividing by 1000000; NOTE(review): assumes end_time_ >= begin_time_ (unsigned subtraction) - confirm
+ static_cast<uint32_t>((end_time_ - begin_time_) / 1000000);
+ int ret;
+ Status s;
+ std::vector<std::vector<uint32_t>> type_qps(  // type_qps[second][type]; the extra column kTaTypeNum holds the all-types total
+ duration, std::vector<uint32_t>(kTaTypeNum + 1, 0));
+ std::vector<uint64_t> qps_sum(kTaTypeNum + 1, 0);
+ std::vector<uint32_t> qps_peak(kTaTypeNum + 1, 0);
+ qps_ave_.resize(kTaTypeNum + 1);
+
+ for (int type = 0; type < kTaTypeNum; type++) {
+ if (!ta_[type].enabled) {
+ continue;
+ }
+ for (auto& stat : ta_[type].stats) {  // one TraceStats per column family for this type
+ uint32_t time_line = 0;  // next second for which a line must be written to a_qps_f
+ uint64_t cf_qps_sum = 0;
+ for (auto& time_it : stat.second.a_qps_stats) {
+ if (time_it.first >= duration) {  // ignore samples outside [0, duration)
+ continue;
+ }
+ type_qps[time_it.first][kTaTypeNum] += time_it.second;
+ type_qps[time_it.first][type] += time_it.second;
+ cf_qps_sum += time_it.second;
+ if (time_it.second > stat.second.a_peak_qps) {
+ stat.second.a_peak_qps = time_it.second;
+ }
+ if (stat.second.a_qps_f) {
+ while (time_line < time_it.first) {  // write an explicit 0 line for every second without queries
+ ret = snprintf(buffer_, sizeof(buffer_), "%u\n", 0);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_qps_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write QPS file failed\n");
+ return s;
+ }
+ time_line++;
+ }
+ ret = snprintf(buffer_, sizeof(buffer_), "%u\n", time_it.second);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_qps_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write QPS file failed\n");
+ return s;
+ }
+ if (time_line == time_it.first) {  // step past the second that was just written
+ time_line++;
+ }
+ }
+
+ // Process the top k QPS peaks
+ if (FLAGS_output_prefix_cut > 0) {  // NOTE(review): peaks are tracked only when prefix cut is enabled
+ if (static_cast<int32_t>(stat.second.top_k_qps_sec.size()) <
+ FLAGS_print_top_k_access) {
+ stat.second.top_k_qps_sec.push(
+ std::make_pair(time_it.second, time_it.first));
+ } else {
+ if (stat.second.top_k_qps_sec.size() > 0 &&  // queue is full: replace top() when this second has a larger count
+ stat.second.top_k_qps_sec.top().first < time_it.second) {
+ stat.second.top_k_qps_sec.pop();
+ stat.second.top_k_qps_sec.push(
+ std::make_pair(time_it.second, time_it.first));
+ }
+ }
+ }
+ }
+ if (duration == 0) {  // avoid dividing by a zero-length window
+ stat.second.a_ave_qps = 0;
+ } else {
+ stat.second.a_ave_qps = (static_cast<double>(cf_qps_sum)) / duration;
+ }
+
+ // Output the accessed unique key number change overtime
+ if (stat.second.a_key_num_f) {
+ uint64_t cur_uni_key =
+ static_cast<uint64_t>(stat.second.a_key_stats.size());
+ double cur_ratio = 0.0;
+ uint64_t cur_num = 0;
+ for (uint32_t i = 0; i < duration; i++) {  // one line per second; the last known value is repeated when a second has no entry
+ auto find_time = stat.second.uni_key_num.find(i);
+ if (find_time != stat.second.uni_key_num.end()) {
+ cur_ratio = (static_cast<double>(find_time->second)) / cur_uni_key;
+ cur_num = find_time->second;
+ }
+ ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %.12f\n",
+ cur_num, cur_ratio);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_key_num_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr,
+ "Write accessed unique key number change file failed\n");
+ return s;
+ }
+ }
+ }
+
+ // output the prefix of top k access peak
+ if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) {
+ while (!stat.second.top_k_qps_sec.empty()) {  // drains the queue: it is empty once this loop finishes
+ ret = snprintf(buffer_, sizeof(buffer_), "At time: %u with QPS: %u\n",
+ stat.second.top_k_qps_sec.top().second,
+ stat.second.top_k_qps_sec.top().first);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.second.a_top_qps_prefix_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write prefix QPS top K file failed\n");
+ return s;
+ }
+ uint32_t qps_time = stat.second.top_k_qps_sec.top().second;
+ stat.second.top_k_qps_sec.pop();
+ if (stat.second.a_qps_prefix_stats.find(qps_time) !=  // list every prefix accessed during that second
+ stat.second.a_qps_prefix_stats.end()) {
+ for (auto& qps_prefix : stat.second.a_qps_prefix_stats[qps_time]) {
+ std::string qps_prefix_out =
+ ROCKSDB_NAMESPACE::LDBCommand::StringToHex(qps_prefix.first);
+ ret = snprintf(buffer_, sizeof(buffer_),
+ "The prefix: %s Access count: %u\n",
+ qps_prefix_out.c_str(), qps_prefix.second);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string pout(buffer_);
+ s = stat.second.a_top_qps_prefix_f->Append(pout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write prefix QPS top K file failed\n");
+ return s;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (qps_f_) {  // overall file: one line per second, one column per type followed by the total
+ for (uint32_t i = 0; i < duration; i++) {
+ for (int type = 0; type <= kTaTypeNum; type++) {
+ if (type < kTaTypeNum) {
+ ret = snprintf(buffer_, sizeof(buffer_), "%u ", type_qps[i][type]);
+ } else {
+ ret = snprintf(buffer_, sizeof(buffer_), "%u\n", type_qps[i][type]);
+ }
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = qps_f_->Append(printout);
+ if (!s.ok()) {
+ return s;
+ }
+ qps_sum[type] += type_qps[i][type];  // NOTE(review): sums and peaks are accumulated only inside this qps_f_ branch, so they stay 0 when qps_f_ is not open
+ if (type_qps[i][type] > qps_peak[type]) {
+ qps_peak[type] = type_qps[i][type];
+ }
+ }
+ }
+ }
+
+ if (cf_qps_f_) {  // per-CF file: one line per second, one column per CF
+ int cfs_size = static_cast<uint32_t>(cfs_.size());
+ uint32_t v;
+ for (uint32_t i = 0; i < duration; i++) {
+ for (int cf = 0; cf < cfs_size; cf++) {  // NOTE(review): cfs_ is indexed by position 0..size-1 here but keyed by cf_id elsewhere in this file - confirm ids are dense
+ if (cfs_[cf].cf_qps.find(i) != cfs_[cf].cf_qps.end()) {
+ v = cfs_[cf].cf_qps[i];
+ } else {
+ v = 0;
+ }
+ if (cf < cfs_size - 1) {
+ ret = snprintf(buffer_, sizeof(buffer_), "%u ", v);
+ } else {
+ ret = snprintf(buffer_, sizeof(buffer_), "%u\n", v);
+ }
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = cf_qps_f_->Append(printout);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ }
+
+ qps_peak_ = qps_peak;  // publish the results in the qps_peak_ / qps_ave_ members
+ for (int type = 0; type <= kTaTypeNum; type++) {
+ if (duration == 0) {
+ qps_ave_[type] = 0;
+ } else {
+ qps_ave_[type] = (static_cast<double>(qps_sum[type])) / duration;
+ }
+ }
+
+ return Status::OK();
+}
+
+// Second pass over the collected statistics, executed once for every column family in cfs_:
+// 1) if FLAGS_output_time_series is set, drain each enabled type's time_series queue into its time_series_f file;
+// 2) if FLAGS_key_space_dir is set, read <dir>/<cf_id>.txt (one hex-encoded key per line) and write whole-key-space outputs;
+// 3) if FLAGS_print_top_k_access > 0, fill each enabled type's top_k_queue (at most k entries) from a_key_stats.
+Status TraceAnalyzer::ReProcessing() {
+ int ret;
+ Status s;
+ for (auto& cf_it : cfs_) {
+ uint32_t cf_id = cf_it.first;
+
+ // output the time series;
+ if (FLAGS_output_time_series) {
+ for (int type = 0; type < kTaTypeNum; type++) {
+ if (!ta_[type].enabled ||  // skip types that are disabled or have no stats for this CF
+ ta_[type].stats.find(cf_id) == ta_[type].stats.end()) {
+ continue;
+ }
+ TraceStats& stat = ta_[type].stats[cf_id];
+ if (!stat.time_series_f) {
+ fprintf(stderr, "Cannot write time_series of '%s' in '%u'\n",
+ ta_[type].type_name.c_str(), cf_id);
+ continue;
+ }
+ while (!stat.time_series.empty()) {  // the queue is consumed while it is written out
+ uint64_t key_id = 0;  // stays 0 when the key is not present in a_key_stats
+ auto found = stat.a_key_stats.find(stat.time_series.front().key);
+ if (found != stat.a_key_stats.end()) {
+ key_id = found->second.key_id;
+ }
+ ret =
+ snprintf(buffer_, sizeof(buffer_), "%u %" PRIu64 " %" PRIu64 "\n",
+ stat.time_series.front().type,
+ stat.time_series.front().ts, key_id);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.time_series_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write time series file failed\n");
+ return s;
+ }
+ stat.time_series.pop_front();
+ }
+ }
+ }
+
+ // process the whole key space if needed
+ if (!FLAGS_key_space_dir.empty()) {
+ std::string whole_key_path =
+ FLAGS_key_space_dir + "/" + std::to_string(cf_id) + ".txt";
+ std::string input_key, get_key;
+ std::vector<std::string> prefix(kTaTypeNum);  // last prefix written, tracked separately per type
+ std::istringstream iss;
+ bool has_data = true;
+ std::unique_ptr<SequentialFile> wkey_input_f;
+
+ s = env_->NewSequentialFile(whole_key_path, &wkey_input_f, env_options_);
+ if (!s.ok()) {  // not fatal: the key space step is skipped for this CF
+ fprintf(stderr, "Cannot open the whole key space file of CF: %u\n",
+ cf_id);
+ wkey_input_f.reset();
+ }
+
+ if (wkey_input_f) {
+ std::unique_ptr<FSSequentialFile> file;
+ file = NewLegacySequentialFileWrapper(wkey_input_f);
+ size_t kTraceFileReadaheadSize = 2 * 1024 * 1024;
+ SequentialFileReader sf_reader(
+ std::move(file), whole_key_path,
+ kTraceFileReadaheadSize /* filereadahead_size */);
+ for (cfs_[cf_id].w_count = 0;  // w_count is the index of the current line and ends as the number of lines read
+ ReadOneLine(&iss, &sf_reader, &get_key, &has_data, &s);
+ ++cfs_[cf_id].w_count) {
+ if (!s.ok()) {
+ fprintf(stderr, "Read whole key space file failed\n");
+ return s;
+ }
+
+ input_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(get_key);  // file lines are hex; decode to the raw key
+ for (int type = 0; type < kTaTypeNum; type++) {
+ if (!ta_[type].enabled) {
+ continue;
+ }
+ TraceStats& stat = ta_[type].stats[cf_id];  // NOTE(review): operator[] inserts an empty TraceStats when this type has no entry for cf_id - confirm intended
+ if (stat.w_key_f) {
+ if (stat.a_key_stats.find(input_key) != stat.a_key_stats.end()) {  // only keys that were accessed in the trace are written
+ ret = snprintf(buffer_, sizeof(buffer_),
+ "%" PRIu64 " %" PRIu64 "\n", cfs_[cf_id].w_count,
+ stat.a_key_stats[input_key].access_count);
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.w_key_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr, "Write whole key space access file failed\n");
+ return s;
+ }
+ }
+ }
+
+ // Output the prefix cut file of the whole key space
+ if (FLAGS_output_prefix_cut > 0 && stat.w_prefix_cut_f) {
+ if (input_key.compare(0, FLAGS_output_prefix_cut, prefix[type]) !=  // write a line each time the key prefix changes
+ 0) {
+ prefix[type] = input_key.substr(0, FLAGS_output_prefix_cut);
+ std::string prefix_out =
+ ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix[type]);
+ ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %s\n",
+ cfs_[cf_id].w_count, prefix_out.c_str());
+ if (ret < 0) {
+ return Status::IOError("Format the output failed");
+ }
+ std::string printout(buffer_);
+ s = stat.w_prefix_cut_f->Append(printout);
+ if (!s.ok()) {
+ fprintf(stderr,
+ "Write whole key space prefix cut file failed\n");
+ return s;
+ }
+ }
+ }
+ }
+
+ // Make the statistics of the key size distribution
+ if (FLAGS_output_key_distribution) {
+ if (cfs_[cf_id].w_key_size_stats.find(input_key.size()) ==
+ cfs_[cf_id].w_key_size_stats.end()) {
+ cfs_[cf_id].w_key_size_stats[input_key.size()] = 1;
+ } else {
+ cfs_[cf_id].w_key_size_stats[input_key.size()]++;
+ }
+ }
+ }
+ }
+ }
+
+ // process the top k accessed keys
+ if (FLAGS_print_top_k_access > 0) {
+ for (int type = 0; type < kTaTypeNum; type++) {
+ if (!ta_[type].enabled ||
+ ta_[type].stats.find(cf_id) == ta_[type].stats.end()) {
+ continue;
+ }
+ TraceStats& stat = ta_[type].stats[cf_id];
+ for (auto& record : stat.a_key_stats) {
+ if (static_cast<int32_t>(stat.top_k_queue.size()) <  // fill the queue up to k entries first
+ FLAGS_print_top_k_access) {
+ stat.top_k_queue.push(
+ std::make_pair(record.second.access_count, record.first));
+ } else {
+ if (record.second.access_count > stat.top_k_queue.top().first) {  // then replace top() whenever a key has a larger access count
+ stat.top_k_queue.pop();
+ stat.top_k_queue.push(
+ std::make_pair(record.second.access_count, record.first));
+ }
+ }
+ }
+ }
+ }
+ }
+ return Status::OK();
+}
+
+// End the processing: close the human-readable trace sequence file if one is
+// open and, unless FLAGS_no_print is set, print the collected statistics and
+// close the per-type/per-CF output files.
+// Returns the status of closing the trace sequence file, so a failure while
+// finishing that file is reported to the caller instead of being dropped;
+// returns OK when no trace sequence file was open.
+Status TraceAnalyzer::EndProcessing() {
+  Status s;
+  if (trace_sequence_f_) {
+    s = trace_sequence_f_->Close();
+  }
+  if (FLAGS_no_print) {
+    return s;
+  }
+  // Printing and closing the remaining files still happens when the close
+  // above failed, so the results already collected are not lost.
+  PrintStatistics();
+  CloseOutputFiles();
+  return s;
+}
+
+// Record one access of `key` (operation `type`, column family `cf_id`, timestamp `ts`) in ta_[type].stats[cf_id] and cfs_[cf_id],
+// creating the per-CF entry and opening its output files on first use; also queues a time-series sample when FLAGS_output_time_series is set.
+Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type,
+ const uint32_t& cf_id,
+ const std::string& key,
+ const size_t value_size,
+ const uint64_t ts) {
+ Status s;
+ StatsUnit unit;  // record stored when this key is seen for the first time
+ unit.key_id = 0;
+ unit.cf_id = cf_id;
+ unit.value_size = value_size;
+ unit.access_count = 1;
+ unit.latest_ts = ts;
+ if (type != TraceOperationType::kGet || value_size > 0) {  // a Get counts as successful only when value_size > 0; every other type always counts
+ unit.succ_count = 1;
+ } else {
+ unit.succ_count = 0;
+ }
+ unit.v_correlation.resize(analyzer_opts_.correlation_list.size());
+ for (int i = 0;
+ i < (static_cast<int>(analyzer_opts_.correlation_list.size())); i++) {
+ unit.v_correlation[i].count = 0;
+ unit.v_correlation[i].total_ts = 0;
+ }
+ std::string prefix;  // stays empty when prefix cut is disabled
+ if (FLAGS_output_prefix_cut > 0) {
+ prefix = key.substr(0, FLAGS_output_prefix_cut);
+ }
+
+ if (begin_time_ == 0) {  // the first timestamp seen becomes the begin time
+ begin_time_ = ts;
+ }
+ uint32_t time_in_sec;
+ if (ts < begin_time_) {  // timestamps before the begin time are counted in second 0
+ time_in_sec = 0;
+ } else {
+ time_in_sec = static_cast<uint32_t>((ts - begin_time_) / 1000000);
+ }
+
+ uint64_t dist_value_size = value_size / FLAGS_value_interval;  // bucket key for a_value_size_stats; NOTE(review): divides by the flag - confirm it cannot be 0
+ auto found_stats = ta_[type].stats.find(cf_id);
+ if (found_stats == ta_[type].stats.end()) {  // first access of this type to this CF: initialize the whole entry
+ ta_[type].stats[cf_id].cf_id = cf_id;
+ ta_[type].stats[cf_id].cf_name = std::to_string(cf_id);
+ ta_[type].stats[cf_id].a_count = 1;
+ ta_[type].stats[cf_id].a_key_id = 0;
+ ta_[type].stats[cf_id].a_key_size_sqsum = MultiplyCheckOverflow(
+ static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size()));
+ ta_[type].stats[cf_id].a_key_size_sum = key.size();
+ ta_[type].stats[cf_id].a_value_size_sqsum = MultiplyCheckOverflow(
+ static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size));
+ ta_[type].stats[cf_id].a_value_size_sum = value_size;
+ s = OpenStatsOutputFiles(ta_[type].type_name, ta_[type].stats[cf_id]);  // NOTE(review): s is never checked; this function always returns OK
+ if (!FLAGS_print_correlation.empty()) {
+ s = StatsUnitCorrelationUpdate(unit, type, ts, key);
+ }
+ ta_[type].stats[cf_id].a_key_stats[key] = unit;
+ ta_[type].stats[cf_id].a_value_size_stats[dist_value_size] = 1;
+ ta_[type].stats[cf_id].a_qps_stats[time_in_sec] = 1;
+ ta_[type].stats[cf_id].correlation_output.resize(
+ analyzer_opts_.correlation_list.size());
+ if (FLAGS_output_prefix_cut > 0) {
+ std::map<std::string, uint32_t> tmp_qps_map;
+ tmp_qps_map[prefix] = 1;
+ ta_[type].stats[cf_id].a_qps_prefix_stats[time_in_sec] = tmp_qps_map;
+ }
+ if (time_in_sec != cur_time_sec_) {  // second changed: snapshot the unique key count under the previous second
+ ta_[type].stats[cf_id].uni_key_num[cur_time_sec_] =
+ static_cast<uint64_t>(ta_[type].stats[cf_id].a_key_stats.size());
+ cur_time_sec_ = time_in_sec;
+ }
+ } else {  // this type/CF pair already exists: update its counters in place
+ found_stats->second.a_count++;
+ found_stats->second.a_key_size_sqsum += MultiplyCheckOverflow(
+ static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size()));
+ found_stats->second.a_key_size_sum += key.size();
+ found_stats->second.a_value_size_sqsum += MultiplyCheckOverflow(
+ static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size));
+ found_stats->second.a_value_size_sum += value_size;
+ auto found_key = found_stats->second.a_key_stats.find(key);
+ if (found_key == found_stats->second.a_key_stats.end()) {  // new key: store the prepared record; no correlation update on this path
+ found_stats->second.a_key_stats[key] = unit;
+ } else {
+ found_key->second.access_count++;
+ if (type != TraceOperationType::kGet || value_size > 0) {
+ found_key->second.succ_count++;
+ }
+ if (!FLAGS_print_correlation.empty()) {
+ s = StatsUnitCorrelationUpdate(found_key->second, type, ts, key);
+ }
+ }
+ if (time_in_sec != cur_time_sec_) {
+ found_stats->second.uni_key_num[cur_time_sec_] =
+ static_cast<uint64_t>(found_stats->second.a_key_stats.size());
+ cur_time_sec_ = time_in_sec;
+ }
+
+ auto found_value =
+ found_stats->second.a_value_size_stats.find(dist_value_size);
+ if (found_value == found_stats->second.a_value_size_stats.end()) {
+ found_stats->second.a_value_size_stats[dist_value_size] = 1;
+ } else {
+ found_value->second++;
+ }
+
+ auto found_qps = found_stats->second.a_qps_stats.find(time_in_sec);
+ if (found_qps == found_stats->second.a_qps_stats.end()) {
+ found_stats->second.a_qps_stats[time_in_sec] = 1;
+ } else {
+ found_qps->second++;
+ }
+
+ if (FLAGS_output_prefix_cut > 0) {  // per-second access count of each key prefix
+ auto found_qps_prefix =
+ found_stats->second.a_qps_prefix_stats.find(time_in_sec);
+ if (found_qps_prefix == found_stats->second.a_qps_prefix_stats.end()) {
+ std::map<std::string, uint32_t> tmp_qps_map;
+ found_stats->second.a_qps_prefix_stats[time_in_sec] = tmp_qps_map;
+ }
+ if (found_stats->second.a_qps_prefix_stats[time_in_sec].find(prefix) ==
+ found_stats->second.a_qps_prefix_stats[time_in_sec].end()) {
+ found_stats->second.a_qps_prefix_stats[time_in_sec][prefix] = 1;
+ } else {
+ found_stats->second.a_qps_prefix_stats[time_in_sec][prefix]++;
+ }
+ }
+ }
+
+ if (cfs_.find(cf_id) == cfs_.end()) {  // register the column family the first time it is seen
+ CfUnit cf_unit;
+ cf_unit.cf_id = cf_id;
+ cf_unit.w_count = 0;
+ cf_unit.a_count = 0;
+ cfs_[cf_id] = cf_unit;
+ }
+
+ if (FLAGS_output_qps_stats) {
+ cfs_[cf_id].cf_qps[time_in_sec]++;
+ }
+
+ if (FLAGS_output_time_series) {
+ TraceUnit trace_u;
+ trace_u.type = type;
+ trace_u.key = key;
+ trace_u.value_size = value_size;
+ trace_u.ts = (ts - time_series_start_) / 1000000;  // NOTE(review): unsigned subtraction; assumes ts >= time_series_start_ - confirm
+ trace_u.cf_id = cf_id;
+ ta_[type].stats[cf_id].time_series.push_back(trace_u);
+ }
+
+ return Status::OK();
+}
+
+// Update the correlation counters of `unit` for an access of type
+// `type_second` at time `ts`. For every operation type X for which
+// correlation_map[X][type_second] names a valid slot of unit.v_correlation,
+// and which has already recorded `key` in the same column family with a
+// latest_ts different from `ts`, that slot's count is incremented and the
+// time since X's latest access is added to its total_ts.
+// On success unit.latest_ts is set to `ts`.
+// Returns NotFound (and leaves `unit` untouched) when `type_second` is not
+// below kTaTypeNum.
+Status TraceAnalyzer::StatsUnitCorrelationUpdate(StatsUnit& unit,
+                                                 const uint32_t& type_second,
+                                                 const uint64_t& ts,
+                                                 const std::string& key) {
+  if (type_second >= kTaTypeNum) {
+    fprintf(stderr, "Unknown Type Id: %u\n", type_second);
+    return Status::NotFound();
+  }
+
+  const int analyzer_count = static_cast<int>(ta_.size());
+  const int map_rows = static_cast<int>(analyzer_opts_.correlation_map.size());
+  const int slot_count = static_cast<int>(unit.v_correlation.size());
+
+  for (int type_first = 0; type_first < kTaTypeNum; ++type_first) {
+    // Stop as soon as either table has no row for this type.
+    if (type_first >= analyzer_count || type_first >= map_rows) {
+      break;
+    }
+
+    // A negative id means the pair is not tracked; an id past the end has
+    // no slot to update.
+    const int pair_id = analyzer_opts_.correlation_map[type_first][type_second];
+    if (pair_id < 0 || pair_id >= slot_count) {
+      continue;
+    }
+
+    auto& per_cf = ta_[type_first].stats;
+    auto cf_pos = per_cf.find(unit.cf_id);
+    if (cf_pos == per_cf.end()) {
+      continue;
+    }
+    auto& keys = cf_pos->second.a_key_stats;
+    auto key_pos = keys.find(key);
+    if (key_pos == keys.end()) {
+      continue;
+    }
+
+    // An identical timestamp contributes nothing to the pair.
+    const auto previous_ts = key_pos->second.latest_ts;
+    if (previous_ts == ts) {
+      continue;
+    }
+
+    auto& slot = unit.v_correlation[pair_id];
+    slot.count++;
+    slot.total_ts += (ts - previous_ts);
+  }
+
+  unit.latest_ts = ts;
+  return Status::OK();
+}
+
+// When a new trace statistic (one operation type in one column family) is
+// created, open the output files selected by the analyzer flags and store
+// the handles in `new_stats`.
+//   type       name of the operation type, used in the file names
+//   new_stats  statistics entry whose cf_name is already set; receives the
+//              opened file handles
+// Returns the status of the first file that failed to open; once a failure
+// is seen no further files are opened. Returns OK when every requested file
+// was opened (or none was requested).
+Status TraceAnalyzer::OpenStatsOutputFiles(const std::string& type,
+                                           TraceStats& new_stats) {
+  Status s;
+  // Opens one file for this type/CF unless an earlier open already failed,
+  // so `s` always holds the first error instead of being overwritten.
+  auto open_file = [&](const std::string& ending,
+                       std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f) {
+    if (s.ok()) {
+      s = CreateOutputFile(type, new_stats.cf_name, ending, f);
+    }
+  };
+
+  if (FLAGS_output_key_stats) {
+    open_file("accessed_key_stats.txt", &new_stats.a_key_f);
+    open_file("accessed_unique_key_num_change.txt", &new_stats.a_key_num_f);
+    if (!FLAGS_key_space_dir.empty()) {
+      open_file("whole_key_stats.txt", &new_stats.w_key_f);
+    }
+  }
+
+  if (FLAGS_output_access_count_stats) {
+    open_file("accessed_key_count_distribution.txt",
+              &new_stats.a_count_dist_f);
+  }
+
+  if (FLAGS_output_prefix_cut > 0) {
+    open_file("accessed_key_prefix_cut.txt", &new_stats.a_prefix_cut_f);
+    if (!FLAGS_key_space_dir.empty()) {
+      open_file("whole_key_prefix_cut.txt", &new_stats.w_prefix_cut_f);
+    }
+
+    if (FLAGS_output_qps_stats) {
+      open_file("accessed_top_k_qps_prefix_cut.txt",
+                &new_stats.a_top_qps_prefix_f);
+    }
+  }
+
+  if (FLAGS_output_time_series) {
+    open_file("time_series.txt", &new_stats.time_series_f);
+  }
+
+  if (FLAGS_output_value_distribution) {
+    open_file("accessed_value_size_distribution.txt",
+              &new_stats.a_value_size_f);
+  }
+
+  if (FLAGS_output_key_distribution) {
+    open_file("accessed_key_size_distribution.txt", &new_stats.a_key_size_f);
+  }
+
+  if (FLAGS_output_qps_stats) {
+    open_file("qps_stats.txt", &new_stats.a_qps_f);
+  }
+
+  return s;
+}
+
+// Create one statistics output file and hand it back through `f_ptr`.
+// The file path is
+//   <output_path_>/<FLAGS_output_prefix>-<type>-<cf_name>-<ending>
+// A failure to create the file is fatal: the path is reported on stderr and
+// the process exits with code 1, so the function only ever returns OK.
+Status TraceAnalyzer::CreateOutputFile(
+    const std::string& type, const std::string& cf_name,
+    const std::string& ending,
+    std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr) {
+  const std::string path = output_path_ + "/" + FLAGS_output_prefix + "-" +
+                           type + "-" + cf_name + "-" + ending;
+  Status s = env_->NewWritableFile(path, f_ptr, env_options_);
+  if (s.ok()) {
+    return Status::OK();
+  }
+  fprintf(stderr, "Cannot open file: %s\n", path.c_str());
+  exit(1);
+  return Status::OK();
+}
+
+// Close every statistics output file that is currently open, for each
+// enabled operation type and each of its column families. The Status
+// returned by the individual Close() calls is not inspected.
+void TraceAnalyzer::CloseOutputFiles() {
+  // Closes the file behind `file` when one is open; does nothing otherwise.
+  auto close_if_open =
+      [](std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>& file) {
+        if (file) {
+          file->Close();
+        }
+      };
+
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (ta_[type].enabled) {
+      for (auto& entry : ta_[type].stats) {
+        TraceStats& cf_stats = entry.second;
+        // Files for the keys accessed in the trace.
+        close_if_open(cf_stats.time_series_f);
+        close_if_open(cf_stats.a_key_f);
+        close_if_open(cf_stats.a_key_num_f);
+        close_if_open(cf_stats.a_count_dist_f);
+        close_if_open(cf_stats.a_prefix_cut_f);
+        close_if_open(cf_stats.a_value_size_f);
+        close_if_open(cf_stats.a_key_size_f);
+        close_if_open(cf_stats.a_qps_f);
+        close_if_open(cf_stats.a_top_qps_prefix_f);
+        // Files for the whole key space.
+        close_if_open(cf_stats.w_key_f);
+        close_if_open(cf_stats.w_prefix_cut_f);
+      }
+    }
+  }
+}
+
+// Handle one Get request of the trace.
+// The request is first written to the human-readable trace sequence file
+// (when conversion is requested and the file is open), always with value
+// size 0. It is then passed through the sampling counter and, if it is
+// sampled and Get analysis is enabled, recorded with KeyStatsInsertion.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandleGet(uint32_t column_family_id,
+                                const std::string& key, const uint64_t& ts,
+                                const uint32_t& get_ret) {
+  const size_t no_value = 0;
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    Status write_status = WriteTraceSequence(
+        TraceOperationType::kGet, column_family_id, key, no_value, ts);
+    if (!write_status.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  auto& get_ta = ta_[TraceOperationType::kGet];
+  if (get_ta.sample_count >= sample_max_) {
+    get_ta.sample_count = 0;
+  }
+  const bool skip_request = get_ta.sample_count > 0;
+  get_ta.sample_count++;
+  if (skip_request || !get_ta.enabled) {
+    return Status::OK();
+  }
+
+  // When get_ret is 1 a non-zero placeholder size (10) is recorded; a Get
+  // with value size 0 is recorded with size 0.
+  const size_t value_size = (get_ret == 1) ? 10 : 0;
+  Status s = KeyStatsInsertion(TraceOperationType::kGet, column_family_id, key,
+                               value_size, ts);
+  if (s.ok()) {
+    return s;
+  }
+  return Status::Corruption("Failed to insert key statistics");
+}
+
+// Handle the Put request in the write batch of the trace.
+// The request is written to the human-readable trace sequence file (when
+// conversion is requested and the file is open), passed through the
+// sampling counter and, if it is sampled and Put analysis is enabled,
+// recorded with KeyStatsInsertion using c_time_ as its timestamp.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandlePut(uint32_t column_family_id, const Slice& key,
+                                const Slice& value) {
+  Status s;
+  // Only the length of the value is needed, so read it from the Slice
+  // directly instead of copying the whole value into a std::string.
+  const size_t value_size = value.size();
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    s = WriteTraceSequence(TraceOperationType::kPut, column_family_id,
+                           key.ToString(), value_size, c_time_);
+    if (!s.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  if (ta_[TraceOperationType::kPut].sample_count >= sample_max_) {
+    ta_[TraceOperationType::kPut].sample_count = 0;
+  }
+  if (ta_[TraceOperationType::kPut].sample_count > 0) {
+    ta_[TraceOperationType::kPut].sample_count++;
+    return Status::OK();
+  }
+  ta_[TraceOperationType::kPut].sample_count++;
+
+  if (!ta_[TraceOperationType::kPut].enabled) {
+    return Status::OK();
+  }
+  s = KeyStatsInsertion(TraceOperationType::kPut, column_family_id,
+                        key.ToString(), value_size, c_time_);
+  if (!s.ok()) {
+    return Status::Corruption("Failed to insert key statistics");
+  }
+  return s;
+}
+
+// Handle one Delete request of a traced write batch.
+// The request is written to the human-readable trace sequence file (when
+// conversion is requested and the file is open), passed through the
+// sampling counter and, if it is sampled and Delete analysis is enabled,
+// recorded with KeyStatsInsertion using value size 0 and timestamp c_time_.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandleDelete(uint32_t column_family_id,
+                                   const Slice& key) {
+  const size_t no_value = 0;
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    Status write_status =
+        WriteTraceSequence(TraceOperationType::kDelete, column_family_id,
+                           key.ToString(), no_value, c_time_);
+    if (!write_status.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  auto& delete_ta = ta_[TraceOperationType::kDelete];
+  if (delete_ta.sample_count >= sample_max_) {
+    delete_ta.sample_count = 0;
+  }
+  const bool skip_request = delete_ta.sample_count > 0;
+  delete_ta.sample_count++;
+  if (skip_request || !delete_ta.enabled) {
+    return Status::OK();
+  }
+
+  Status s = KeyStatsInsertion(TraceOperationType::kDelete, column_family_id,
+                               key.ToString(), no_value, c_time_);
+  if (s.ok()) {
+    return s;
+  }
+  return Status::Corruption("Failed to insert key statistics");
+}
+
+// Handle one SingleDelete request of a traced write batch.
+// The request is written to the human-readable trace sequence file (when
+// conversion is requested and the file is open), passed through the
+// sampling counter and, if it is sampled and SingleDelete analysis is
+// enabled, recorded with KeyStatsInsertion using value size 0 and
+// timestamp c_time_.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandleSingleDelete(uint32_t column_family_id,
+                                         const Slice& key) {
+  const size_t no_value = 0;
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    Status write_status =
+        WriteTraceSequence(TraceOperationType::kSingleDelete, column_family_id,
+                           key.ToString(), no_value, c_time_);
+    if (!write_status.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  auto& single_delete_ta = ta_[TraceOperationType::kSingleDelete];
+  if (single_delete_ta.sample_count >= sample_max_) {
+    single_delete_ta.sample_count = 0;
+  }
+  const bool skip_request = single_delete_ta.sample_count > 0;
+  single_delete_ta.sample_count++;
+  if (skip_request || !single_delete_ta.enabled) {
+    return Status::OK();
+  }
+
+  Status s =
+      KeyStatsInsertion(TraceOperationType::kSingleDelete, column_family_id,
+                        key.ToString(), no_value, c_time_);
+  if (s.ok()) {
+    return s;
+  }
+  return Status::Corruption("Failed to insert key statistics");
+}
+
+// Handle the DeleteRange request in the write batch of the trace.
+// Only begin_key is written to the human-readable trace sequence file (when
+// conversion is requested and the file is open). If the request is sampled
+// and RangeDelete analysis is enabled, both begin_key and end_key are
+// recorded with KeyStatsInsertion using value size 0 and timestamp c_time_.
+// Returns Corruption when writing the trace sequence fails or when the
+// insertion of either key fails, OK otherwise.
+Status TraceAnalyzer::HandleDeleteRange(uint32_t column_family_id,
+                                        const Slice& begin_key,
+                                        const Slice& end_key) {
+  Status s;
+  size_t value_size = 0;
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    s = WriteTraceSequence(TraceOperationType::kRangeDelete, column_family_id,
+                           begin_key.ToString(), value_size, c_time_);
+    if (!s.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  if (ta_[TraceOperationType::kRangeDelete].sample_count >= sample_max_) {
+    ta_[TraceOperationType::kRangeDelete].sample_count = 0;
+  }
+  if (ta_[TraceOperationType::kRangeDelete].sample_count > 0) {
+    ta_[TraceOperationType::kRangeDelete].sample_count++;
+    return Status::OK();
+  }
+  ta_[TraceOperationType::kRangeDelete].sample_count++;
+
+  if (!ta_[TraceOperationType::kRangeDelete].enabled) {
+    return Status::OK();
+  }
+  s = KeyStatsInsertion(TraceOperationType::kRangeDelete, column_family_id,
+                        begin_key.ToString(), value_size, c_time_);
+  // Check the first insertion before issuing the second one; otherwise its
+  // status would be overwritten and a begin_key failure would go unnoticed.
+  if (s.ok()) {
+    s = KeyStatsInsertion(TraceOperationType::kRangeDelete, column_family_id,
+                          end_key.ToString(), value_size, c_time_);
+  }
+  if (!s.ok()) {
+    return Status::Corruption("Failed to insert key statistics");
+  }
+  return s;
+}
+
+// Handle the Merge request in the write batch of the trace.
+// The request is written to the human-readable trace sequence file (when
+// conversion is requested and the file is open), passed through the
+// sampling counter and, if it is sampled and Merge analysis is enabled,
+// recorded with KeyStatsInsertion using c_time_ as its timestamp.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandleMerge(uint32_t column_family_id, const Slice& key,
+                                  const Slice& value) {
+  Status s;
+  // Only the length of the value is needed, so read it from the Slice
+  // directly instead of copying the whole value into a std::string.
+  const size_t value_size = value.size();
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    s = WriteTraceSequence(TraceOperationType::kMerge, column_family_id,
+                           key.ToString(), value_size, c_time_);
+    if (!s.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  if (ta_[TraceOperationType::kMerge].sample_count >= sample_max_) {
+    ta_[TraceOperationType::kMerge].sample_count = 0;
+  }
+  if (ta_[TraceOperationType::kMerge].sample_count > 0) {
+    ta_[TraceOperationType::kMerge].sample_count++;
+    return Status::OK();
+  }
+  ta_[TraceOperationType::kMerge].sample_count++;
+
+  if (!ta_[TraceOperationType::kMerge].enabled) {
+    return Status::OK();
+  }
+  s = KeyStatsInsertion(TraceOperationType::kMerge, column_family_id,
+                        key.ToString(), value_size, c_time_);
+  if (!s.ok()) {
+    return Status::Corruption("Failed to insert key statistics");
+  }
+  return s;
+}
+
+// Handle one iterator request of the trace.
+// Only kTraceIteratorSeek and kTraceIteratorSeekForPrev are analyzed; any
+// other trace type returns OK without doing anything. The request is
+// written to the human-readable trace sequence file (when conversion is
+// requested and the file is open), passed through the sampling counter of
+// its operation type and, if it is sampled and that type is enabled,
+// recorded with KeyStatsInsertion using value size 0.
+// Returns Corruption when writing the trace sequence or inserting the key
+// statistics fails, OK otherwise.
+Status TraceAnalyzer::HandleIter(uint32_t column_family_id,
+                                 const std::string& key, const uint64_t& ts,
+                                 TraceType& trace_type) {
+  int type;
+  switch (trace_type) {
+    case kTraceIteratorSeek:
+      type = TraceOperationType::kIteratorSeek;
+      break;
+    case kTraceIteratorSeekForPrev:
+      type = TraceOperationType::kIteratorSeekForPrev;
+      break;
+    default:
+      return Status::OK();
+  }
+
+  const size_t no_value = 0;
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    Status write_status =
+        WriteTraceSequence(type, column_family_id, key, no_value, ts);
+    if (!write_status.ok()) {
+      return Status::Corruption("Failed to write the trace sequence to file");
+    }
+  }
+
+  // Sampling: the counter wraps to 0 once it reaches sample_max_, and only
+  // the request that finds the counter at 0 is analyzed.
+  auto& iter_ta = ta_[type];
+  if (iter_ta.sample_count >= sample_max_) {
+    iter_ta.sample_count = 0;
+  }
+  const bool skip_request = iter_ta.sample_count > 0;
+  iter_ta.sample_count++;
+  if (skip_request || !iter_ta.enabled) {
+    return Status::OK();
+  }
+
+  Status s = KeyStatsInsertion(type, column_family_id, key, no_value, ts);
+  if (s.ok()) {
+    return s;
+  }
+  return Status::Corruption("Failed to insert key statistics");
+}
+
+// Dump the requested general statistics to stdout right before the analyzer is
+// closed. At this stage none of this information is written to the output
+// files. The report is nested as follows:
+//   -----type
+//        |__cf_id
+//             |_statistics
+void TraceAnalyzer::PrintStatistics() {
+  for (int type = 0; type < kTaTypeNum; type++) {
+    TypeUnit& op = ta_[type];
+    if (!op.enabled) {
+      continue;
+    }
+    // The per-type totals are recomputed from the per-cf statistics below.
+    op.total_keys = 0;
+    op.total_access = 0;
+    op.total_succ_access = 0;
+    printf("\n################# Operation Type: %s #####################\n",
+           op.type_name.c_str());
+    if (qps_ave_.size() == kTaTypeNum + 1) {
+      printf("Peak QPS is: %u Average QPS is: %f\n", qps_peak_[type],
+             qps_ave_[type]);
+    }
+    for (auto& cf_entry : op.stats) {
+      TraceStats& stat = cf_entry.second;
+      if (stat.a_count == 0) {
+        continue;
+      }
+      const uint64_t unique_keys =
+          static_cast<uint64_t>(stat.a_key_stats.size());
+      // a_count is non-zero from here on, so the divisions are well defined.
+      const double key_size_ave =
+          static_cast<double>(stat.a_key_size_sum) / stat.a_count;
+      const double value_size_ave =
+          static_cast<double>(stat.a_value_size_sum) / stat.a_count;
+      // Each "variation" is sqrt(mean of squares - square of mean).
+      const double key_size_vari =
+          std::sqrt(static_cast<double>(stat.a_key_size_sqsum) / stat.a_count -
+                    key_size_ave * key_size_ave);
+      const double value_size_vari = std::sqrt(
+          static_cast<double>(stat.a_value_size_sqsum) / stat.a_count -
+          value_size_ave * value_size_ave);
+      if (value_size_ave == 0.0) {
+        stat.a_value_mid = 0;
+      }
+      cfs_[stat.cf_id].a_count += unique_keys;
+      op.total_keys += unique_keys;
+      op.total_access += stat.a_count;
+      op.total_succ_access += stat.a_succ_count;
+      printf("*********************************************************\n");
+      printf("colume family id: %u\n", stat.cf_id);
+      printf("Total number of queries to this cf by %s: %" PRIu64 "\n",
+             op.type_name.c_str(), stat.a_count);
+      printf("Total unique keys in this cf: %" PRIu64 "\n", unique_keys);
+      printf("Average key size: %f key size medium: %" PRIu64
+             " Key size Variation: %f\n",
+             key_size_ave, stat.a_key_mid, key_size_vari);
+      if (type == kPut || type == kMerge) {
+        printf("Average value size: %f Value size medium: %" PRIu64
+               " Value size variation: %f\n",
+               value_size_ave, stat.a_value_mid, value_size_vari);
+      }
+      printf("Peak QPS is: %u Average QPS is: %f\n", stat.a_peak_qps,
+             stat.a_ave_qps);
+
+      // Print the top k accessed keys with their access counts. The queue is
+      // drained while it is printed.
+      if (FLAGS_print_top_k_access > 0) {
+        printf("The Top %d keys that are accessed:\n",
+               FLAGS_print_top_k_access);
+        for (; !stat.top_k_queue.empty(); stat.top_k_queue.pop()) {
+          const auto& entry = stat.top_k_queue.top();
+          std::string hex_key =
+              ROCKSDB_NAMESPACE::LDBCommand::StringToHex(entry.second);
+          printf("Access_count: %" PRIu64 " %s\n", entry.first,
+                 hex_key.c_str());
+        }
+      }
+
+      // Print the top k accessed prefix ranges, then the top k prefix ranges
+      // with the highest average access per key. Both queues are drained.
+      if (FLAGS_output_prefix_cut > 0) {
+        printf("The Top %d accessed prefix range:\n", FLAGS_print_top_k_access);
+        for (; !stat.top_k_prefix_access.empty();
+             stat.top_k_prefix_access.pop()) {
+          const auto& entry = stat.top_k_prefix_access.top();
+          printf("Prefix: %s Access count: %" PRIu64 "\n",
+                 entry.second.c_str(), entry.first);
+        }
+
+        printf("The Top %d prefix with highest access per key:\n",
+               FLAGS_print_top_k_access);
+        for (; !stat.top_k_prefix_ave.empty(); stat.top_k_prefix_ave.pop()) {
+          const auto& entry = stat.top_k_prefix_ave.top();
+          printf("Prefix: %s access per key: %f\n", entry.second.c_str(),
+                 entry.first);
+        }
+      }
+
+      // Print the operation correlations, one line per requested pair.
+      if (!FLAGS_print_correlation.empty()) {
+        const int correlation_num =
+            static_cast<int>(analyzer_opts_.correlation_list.size());
+        for (int idx = 0; idx < correlation_num; idx++) {
+          printf("The correlation statistics of '%s' after '%s' is:",
+                 taIndexToOpt[analyzer_opts_.correlation_list[idx].second]
+                     .c_str(),
+                 taIndexToOpt[analyzer_opts_.correlation_list[idx].first]
+                     .c_str());
+          const auto& result = stat.correlation_output[idx];
+          double correlation_ave = 0.0;
+          if (result.first > 0) {
+            // total / (count * 1000); printed with an "(ms)" suffix.
+            correlation_ave = static_cast<double>(result.second) /
+                              (result.first * 1000);
+          }
+          printf(" total numbers: %" PRIu64 " average time: %f(ms)\n",
+                 result.first, correlation_ave);
+        }
+      }
+    }
+    printf("*********************************************************\n");
+    printf("Total keys of '%s' is: %" PRIu64 "\n", op.type_name.c_str(),
+           op.total_keys);
+    printf("Total access is: %" PRIu64 "\n", op.total_access);
+    total_access_keys_ += op.total_keys;
+  }
+
+  // Print the overall statistic information of the trace
+  printf("\n*********************************************************\n");
+  printf("*********************************************************\n");
+  printf("The column family based statistics\n");
+  for (auto& cf : cfs_) {
+    printf("The column family id: %u\n", cf.first);
+    printf("The whole key space key numbers: %" PRIu64 "\n", cf.second.w_count);
+    printf("The accessed key space key numbers: %" PRIu64 "\n",
+           cf.second.a_count);
+  }
+
+  if (!FLAGS_print_overall_stats) {
+    return;
+  }
+  printf("\n*********************************************************\n");
+  printf("*********************************************************\n");
+  if (qps_peak_.size() == kTaTypeNum + 1) {
+    // Index kTaTypeNum is read here as the overall (all types) entry.
+    printf("Average QPS per second: %f Peak QPS: %u\n", qps_ave_[kTaTypeNum],
+           qps_peak_[kTaTypeNum]);
+  }
+  printf("The statistics related to query number need to times: %u\n",
+         sample_max_);
+  printf("Total_requests: %" PRIu64 " Total_accessed_keys: %" PRIu64
+         " Total_gets: %" PRIu64 " Total_write_batch: %" PRIu64 "\n",
+         total_requests_, total_access_keys_, total_gets_, total_writes_);
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (ta_[type].enabled) {
+      printf("Operation: '%s' has: %" PRIu64 "\n",
+             ta_[type].type_name.c_str(), ta_[type].total_access);
+    }
+  }
+}
+
+// Append one record of the human readable trace to trace_sequence_f_.
+// The record is "<type> <cf_id> <value_size> <ts>\n"; unless FLAGS_no_key is
+// set it is preceded by the hex encoded key and a space.
+// Returns IOError when the record cannot be formatted, otherwise the status of
+// the Append() call.
+Status TraceAnalyzer::WriteTraceSequence(const uint32_t& type,
+                                         const uint32_t& cf_id,
+                                         const std::string& key,
+                                         const size_t value_size,
+                                         const uint64_t ts) {
+  std::string hex_key = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(key);
+  const int written =
+      snprintf(buffer_, sizeof(buffer_), "%u %u %zu %" PRIu64 "\n", type,
+               cf_id, value_size, ts);
+  if (written < 0) {
+    return Status::IOError("failed to format the output");
+  }
+  std::string record;
+  if (!FLAGS_no_key) {
+    record.append(hex_key).append(" ");
+  }
+  record.append(buffer_);
+  return trace_sequence_f_->Append(record);
+}
+
+// The entrance function of Trace_Analyzer
+// Parses the command line flags, creates a TraceAnalyzer for FLAGS_trace_path
+// and FLAGS_output_dir and runs its stages in order: PrepareProcessing,
+// StartProcessing, MakeStatistics, ReProcessing and EndProcessing.
+// Returns 0 on success. When a stage fails, the error is printed to stderr and
+// the process exits with status 1 (a StartProcessing failure is tolerated when
+// FLAGS_try_process_corrupted_trace is set).
+int trace_analyzer_tool(int argc, char** argv) {
+  AnalyzerOptions analyzer_opts;
+
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (!FLAGS_print_correlation.empty()) {
+    analyzer_opts.SparseCorrelationInput(FLAGS_print_correlation);
+  }
+
+  std::unique_ptr<TraceAnalyzer> analyzer(
+      new TraceAnalyzer(FLAGS_trace_path, FLAGS_output_dir, analyzer_opts));
+
+  if (!analyzer) {
+    fprintf(stderr, "Cannot initiate the trace analyzer\n");
+    exit(1);
+  }
+
+  // Print the state string of a failed status. Passing a null pointer to "%s"
+  // is undefined behavior, so fall back to ToString() if getState() is null.
+  auto print_state = [](const ROCKSDB_NAMESPACE::Status& st) {
+    const char* state = st.getState();
+    fprintf(stderr, "%s\n", state != nullptr ? state : st.ToString().c_str());
+  };
+
+  ROCKSDB_NAMESPACE::Status s = analyzer->PrepareProcessing();
+  if (!s.ok()) {
+    print_state(s);
+    fprintf(stderr, "Cannot initiate the trace reader\n");
+    exit(1);
+  }
+
+  s = analyzer->StartProcessing();
+  if (!s.ok() && !FLAGS_try_process_corrupted_trace) {
+    print_state(s);
+    fprintf(stderr, "Cannot processing the trace\n");
+    exit(1);
+  }
+
+  s = analyzer->MakeStatistics();
+  if (!s.ok()) {
+    print_state(s);
+    analyzer->EndProcessing();
+    fprintf(stderr, "Cannot make the statistics\n");
+    exit(1);
+  }
+
+  s = analyzer->ReProcessing();
+  if (!s.ok()) {
+    print_state(s);
+    fprintf(stderr, "Cannot re-process the trace for more statistics\n");
+    analyzer->EndProcessing();
+    exit(1);
+  }
+
+  s = analyzer->EndProcessing();
+  if (!s.ok()) {
+    print_state(s);
+    fprintf(stderr, "Cannot close the trace analyzer\n");
+    exit(1);
+  }
+
+  return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // Endif of Gflag
+#endif // RocksDB LITE
diff --git a/src/rocksdb/tools/trace_analyzer_tool.h b/src/rocksdb/tools/trace_analyzer_tool.h
new file mode 100644
index 000000000..d2df2c824
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_tool.h
@@ -0,0 +1,292 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <list>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/write_batch.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class WriteBatch;
+
+enum TraceOperationType : int { // Operation kinds tracked by the analyzer; the values are used as indices into ta_
+ kGet = 0,
+ kPut = 1,
+ kDelete = 2,
+ kSingleDelete = 3,
+ kRangeDelete = 4,
+ kMerge = 5,
+ kIteratorSeek = 6, // selected by HandleIter for kTraceIteratorSeek
+ kIteratorSeekForPrev = 7, // selected by HandleIter for kTraceIteratorSeekForPrev
+ kTaTypeNum = 8 // number of operation types; loop bound when iterating over ta_
+};
+
+struct TraceUnit { // One trace record, as taken by WriteTraceUnit() and TraceUnitWriter()
+ uint64_t ts; // timestamp of the record (unit is not visible in this header)
+ uint32_t type; // operation type; presumably a TraceOperationType value - TODO confirm
+ uint32_t cf_id; // column family id
+ size_t value_size;
+ std::string key;
+};
+
+struct TypeCorrelation { // Element type of StatsUnit::v_correlation
+ uint64_t count; // presumably how often the correlated operation pair was seen - TODO confirm
+ uint64_t total_ts; // presumably the accumulated time between the two operations - TODO confirm
+};
+
+struct StatsUnit { // Value type of TraceStats::a_key_stats, which is keyed by a std::string
+ uint64_t key_id;
+ uint64_t access_count;
+ uint64_t latest_ts;
+ uint64_t succ_count; // currently only used to count Get if key found
+ uint32_t cf_id;
+ size_t value_size;
+ std::vector<TypeCorrelation> v_correlation; // presumably one entry per requested correlation pair - TODO confirm
+};
+
+class AnalyzerOptions { // Options passed by value to the TraceAnalyzer constructor; holds the requested correlations
+ public:
+ std::vector<std::vector<int>> correlation_map;
+ std::vector<std::pair<int, int>> correlation_list; // (first, second) operation indices; reported as "'second' after 'first'"
+
+ AnalyzerOptions();
+
+ ~AnalyzerOptions();
+
+ void SparseCorrelationInput(const std::string& in_str); // trace_analyzer_tool calls this with FLAGS_print_correlation
+};
+
+// Note that, for the variable names in the trace_analyzer,
+// Starting with 'a_' means the variable is used for 'accessed_keys'.
+// Starting with 'w_' means it is used for 'the whole key space'.
+// Ending with '_f' means a file writer or reader pointer.
+// For example, 'a_count' means 'accessed_keys_count',
+// 'w_key_f' means 'whole_key_space_file'.
+
+struct TraceStats { // Statistics of one operation type in one column family (value type of TypeUnit::stats)
+ uint32_t cf_id;
+ std::string cf_name;
+ uint64_t a_count; // reported by PrintStatistics as the total number of queries to this cf
+ uint64_t a_succ_count; // added to TypeUnit::total_succ_access by PrintStatistics
+ uint64_t a_key_id;
+ uint64_t a_key_size_sqsum; // sum of squared key sizes; used for the key size variation
+ uint64_t a_key_size_sum; // sum of key sizes; divided by a_count for the average
+ uint64_t a_key_mid; // printed as "key size medium"
+ uint64_t a_value_size_sqsum; // sum of squared value sizes; used for the value size variation
+ uint64_t a_value_size_sum; // sum of value sizes; divided by a_count for the average
+ uint64_t a_value_mid; // printed as "Value size medium"; reset to 0 when the average value size is 0
+ uint32_t a_peak_qps;
+ double a_ave_qps;
+ std::map<std::string, StatsUnit> a_key_stats; // its size() is reported as the number of unique keys
+ std::map<uint64_t, uint64_t> a_count_stats;
+ std::map<uint64_t, uint64_t> a_key_size_stats;
+ std::map<uint64_t, uint64_t> a_value_size_stats;
+ std::map<uint32_t, uint32_t> a_qps_stats;
+ std::map<uint32_t, std::map<std::string, uint32_t>> a_qps_prefix_stats;
+ std::priority_queue<std::pair<uint64_t, std::string>,
+ std::vector<std::pair<uint64_t, std::string>>,
+ std::greater<std::pair<uint64_t, std::string>>>
+ top_k_queue; // (access count, key); std::greater makes top() the smallest pair
+ std::priority_queue<std::pair<uint64_t, std::string>,
+ std::vector<std::pair<uint64_t, std::string>>,
+ std::greater<std::pair<uint64_t, std::string>>>
+ top_k_prefix_access; // (access count, prefix); drained by PrintStatistics
+ std::priority_queue<std::pair<double, std::string>,
+ std::vector<std::pair<double, std::string>>,
+ std::greater<std::pair<double, std::string>>>
+ top_k_prefix_ave; // (access per key, prefix); drained by PrintStatistics
+ std::priority_queue<std::pair<uint32_t, uint32_t>,
+ std::vector<std::pair<uint32_t, uint32_t>>,
+ std::greater<std::pair<uint32_t, uint32_t>>>
+ top_k_qps_sec;
+ std::list<TraceUnit> time_series;
+ std::vector<std::pair<uint64_t, uint64_t>> correlation_output; // indexed like correlation_list; second / (first * 1000) is printed as the average time in ms
+ std::map<uint32_t, uint64_t> uni_key_num;
+
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> time_series_f; // output file handles ('_f' suffix, see the naming note above)
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_count_dist_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_prefix_cut_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_value_size_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_size_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_num_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_qps_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_top_qps_prefix_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_key_f;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_prefix_cut_f;
+
+ TraceStats();
+ ~TraceStats();
+ TraceStats(const TraceStats&) = delete; // move-only: copying is disabled, moving is defaulted
+ TraceStats& operator=(const TraceStats&) = delete;
+ TraceStats(TraceStats&&) = default;
+ TraceStats& operator=(TraceStats&&) = default;
+};
+
+struct TypeUnit { // Statistics of one operation type; element type of TraceAnalyzer::ta_
+ std::string type_name; // printed in the "Operation Type" banner of PrintStatistics
+ bool enabled; // disabled types are skipped by PrintStatistics
+ uint64_t total_keys; // reset and re-accumulated from stats by PrintStatistics
+ uint64_t total_access; // reset and re-accumulated from stats by PrintStatistics
+ uint64_t total_succ_access; // reset and re-accumulated from stats by PrintStatistics
+ uint32_t sample_count; // sampling counter; HandleIter wraps it to 0 once it reaches sample_max_
+ std::map<uint32_t, TraceStats> stats; // presumably keyed by column family id - TODO confirm
+ TypeUnit() = default;
+ ~TypeUnit() = default;
+ TypeUnit(const TypeUnit&) = delete; // move-only, like TraceStats
+ TypeUnit& operator=(const TypeUnit&) = delete;
+ TypeUnit(TypeUnit&&) = default;
+ TypeUnit& operator=(TypeUnit&&) = default;
+};
+
+struct CfUnit { // Per column family counters; value type of TraceAnalyzer::cfs_
+ uint32_t cf_id;
+ uint64_t w_count; // total keys in this cf if we use the whole key space
+ uint64_t a_count; // the total keys in this cf that are accessed
+ std::map<uint64_t, uint64_t> w_key_size_stats; // whole key space key size
+ // statistics of this cf
+ std::map<uint32_t, uint32_t> cf_qps;
+};
+
+class TraceAnalyzer { // Reads a trace and collects per operation type / per column family statistics
+ public:
+ TraceAnalyzer(std::string& trace_path, std::string& output_path,
+ AnalyzerOptions _analyzer_opts);
+ ~TraceAnalyzer();
+
+ Status PrepareProcessing(); // stage 1 of 5; trace_analyzer_tool runs the five stages in declaration order
+
+ Status StartProcessing(); // stage 2
+
+ Status MakeStatistics(); // stage 3
+
+ Status ReProcessing(); // stage 4
+
+ Status EndProcessing(); // stage 5; also called by trace_analyzer_tool when stage 3 or 4 fails
+
+ Status WriteTraceUnit(TraceUnit& unit);
+
+ // The trace processing functions for the different operation types
+ Status HandleGet(uint32_t column_family_id, const std::string& key,
+ const uint64_t& ts, const uint32_t& get_ret);
+ Status HandlePut(uint32_t column_family_id, const Slice& key,
+ const Slice& value);
+ Status HandleDelete(uint32_t column_family_id, const Slice& key);
+ Status HandleSingleDelete(uint32_t column_family_id, const Slice& key);
+ Status HandleDeleteRange(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key);
+ Status HandleMerge(uint32_t column_family_id, const Slice& key,
+ const Slice& value);
+ Status HandleIter(uint32_t column_family_id, const std::string& key,
+ const uint64_t& ts, TraceType& trace_type); // handles kTraceIteratorSeek / kTraceIteratorSeekForPrev; other trace types return OK untouched
+ std::vector<TypeUnit>& GetTaVector() { return ta_; } // mutable access to the per-type statistics
+
+ private:
+ ROCKSDB_NAMESPACE::Env* env_;
+ EnvOptions env_options_;
+ std::unique_ptr<TraceReader> trace_reader_;
+ size_t offset_;
+ char buffer_[1024]; // scratch buffer; WriteTraceSequence formats each record into it
+ uint64_t c_time_;
+ std::string trace_name_;
+ std::string output_path_;
+ AnalyzerOptions analyzer_opts_;
+ uint64_t total_requests_;
+ uint64_t total_access_keys_; // PrintStatistics adds every enabled type's total_keys to it
+ uint64_t total_gets_;
+ uint64_t total_writes_;
+ uint64_t trace_create_time_;
+ uint64_t begin_time_;
+ uint64_t end_time_;
+ uint64_t time_series_start_;
+ uint32_t sample_max_; // sampling period: HandleIter passes one record on per sample_max_ records
+ uint32_t cur_time_sec_;
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
+ trace_sequence_f_; // readable trace
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> qps_f_; // overall qps
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
+ cf_qps_f_; // The qps of each CF
+ std::vector<TypeUnit> ta_; // The main statistic collecting data structure
+ std::map<uint32_t, CfUnit> cfs_; // All the cf_id appears in this trace;
+ std::vector<uint32_t> qps_peak_; // when sized kTaTypeNum + 1, index kTaTypeNum is printed as the overall peak
+ std::vector<double> qps_ave_; // when sized kTaTypeNum + 1, index kTaTypeNum is printed as the overall average
+
+ Status ReadTraceHeader(Trace* header);
+ Status ReadTraceFooter(Trace* footer);
+ Status ReadTraceRecord(Trace* trace);
+ Status KeyStatsInsertion(const uint32_t& type, const uint32_t& cf_id,
+ const std::string& key, const size_t value_size,
+ const uint64_t ts); // called by HandleIter for sampled records of enabled types
+ Status StatsUnitCorrelationUpdate(StatsUnit& unit, const uint32_t& type,
+ const uint64_t& ts, const std::string& key);
+ Status OpenStatsOutputFiles(const std::string& type, TraceStats& new_stats);
+ Status CreateOutputFile(
+ const std::string& type, const std::string& cf_name,
+ const std::string& ending,
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr);
+ void CloseOutputFiles();
+
+ void PrintStatistics(); // prints the collected statistics to stdout
+ Status TraceUnitWriter(
+ std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>& f_ptr, TraceUnit& unit);
+ Status WriteTraceSequence(const uint32_t& type, const uint32_t& cf_id,
+ const std::string& key, const size_t value_size,
+ const uint64_t ts); // appends one human readable record to trace_sequence_f_
+ Status MakeStatisticKeyStatsOrPrefix(TraceStats& stats);
+ Status MakeStatisticCorrelation(TraceStats& stats, StatsUnit& unit);
+ Status MakeStatisticQPS();
+};
+
+// Write batch handler used with the WriteBatch iterator when processing the
+// write trace. Every callback is forwarded to the matching Handle*() method of
+// the TraceAnalyzer given at construction.
+class TraceWriteHandler : public WriteBatch::Handler {
+ public:
+  TraceWriteHandler() : ta_ptr(nullptr) {}
+  explicit TraceWriteHandler(TraceAnalyzer* _ta_ptr) : ta_ptr(_ta_ptr) {}
+  ~TraceWriteHandler() {}
+
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    return ta_ptr->HandlePut(column_family_id, key, value);
+  }
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    return ta_ptr->HandleDelete(column_family_id, key);
+  }
+  Status SingleDeleteCF(uint32_t column_family_id,
+                        const Slice& key) override {
+    return ta_ptr->HandleSingleDelete(column_family_id, key);
+  }
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override {
+    return ta_ptr->HandleDeleteRange(column_family_id, begin_key, end_key);
+  }
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+    return ta_ptr->HandleMerge(column_family_id, key, value);
+  }
+
+ private:
+  // Target of the forwarded calls. It is never deleted by this class and is
+  // null after default construction.
+  TraceAnalyzer* ta_ptr;
+};
+
+int trace_analyzer_tool(int argc, char** argv); // entry point of the trace analyzer; returns 0 on success and exits the process with status 1 on failure
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/verify_random_db.sh b/src/rocksdb/tools/verify_random_db.sh
new file mode 100755
index 000000000..817e4b984
--- /dev/null
+++ b/src/rocksdb/tools/verify_random_db.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to verify that a DB generated by generate_random_db.sh can be
+# opened and read back correctly: the DB and a base DB are both dumped with
+# ./ldb and the two dump files are diffed. The script exits with a non-zero
+# status when a dump fails or the dumps differ.
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> <db_directory> <compare_base_db_directory> [dump_file_name] [if_try_load_options] [if_ignore_unknown_options]
+
+scriptpath=`dirname $BASH_SOURCE`
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <db_directory> <compare_base_db_directory> [dump_file_name] [if_try_load_options] [if_ignore_unknown_options]"
+  exit 1
+fi
+
+db_dir=$1
+base_db_dir=$2
+dump_file_name=${3:-"dump_file.txt"}
+try_load_options=${4:-"1"}
+ignore_unknown_options=${5:-"0"}
+db_dump=$db_dir"/"$dump_file_name
+base_db_dump=$base_db_dir"/"$dump_file_name
+extra_param=
+
+if [ "$try_load_options" = "1" ]; then
+  extra_param=" --try_load_options "
+fi
+
+# Append rather than assign, so that --try_load_options is not dropped when
+# both switches are requested.
+if [ "$ignore_unknown_options" = "1" ]; then
+  extra_param="$extra_param --ignore_unknown_options "
+fi
+
+set -e
+echo == Dumping data from $db_dir to $db_dump
+# $extra_param is left unquoted on purpose: it holds zero or more options.
+./ldb dump --db="$db_dir" $extra_param > "$db_dump"
+
+echo == Dumping data from $base_db_dir to $base_db_dump
+./ldb dump --db="$base_db_dir" $extra_param > "$base_db_dump"
+
+# Name both dump files explicitly instead of relying on diff resolving a
+# directory operand to the file of the same name inside it.
+diff "$db_dump" "$base_db_dump"
diff --git a/src/rocksdb/tools/write_external_sst.sh b/src/rocksdb/tools/write_external_sst.sh
new file mode 100755
index 000000000..3b02d082f
--- /dev/null
+++ b/src/rocksdb/tools/write_external_sst.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Feeds every sorted_data* file found under <input_data_path> to
+# "./ldb write_extern_sst", writing <extern SST dir>/extern_sst<N> for the
+# N-th file. <DB Path> is removed first and is passed to ldb together with
+# --create_if_missing.
+
+if [ "$#" -lt 3 ]; then
+  echo "usage: $BASH_SOURCE <input_data_path> <DB Path> <extern SST dir>"
+  exit 1
+fi
+
+input_data_dir=$1
+db_dir=$2
+extern_sst_dir=$3
+rm -rf "$db_dir"
+
+set -e
+
+n=0
+
+# The pattern is quoted so that find, and not the shell (which would match
+# files in the current directory), expands it.
+for f in `find "$input_data_dir" -name 'sorted_data*'`
+do
+  echo == Writing external SST file $f to $extern_sst_dir/extern_sst${n}
+  ./ldb --db="$db_dir" --create_if_missing write_extern_sst "$extern_sst_dir/extern_sst${n}" < "$f"
+  let "n = n + 1"
+done
diff --git a/src/rocksdb/tools/write_stress.cc b/src/rocksdb/tools/write_stress.cc
new file mode 100644
index 000000000..15b1da881
--- /dev/null
+++ b/src/rocksdb/tools/write_stress.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+//
+// The goal of this tool is to be a simple stress test with focus on catching:
+// * bugs in compaction/flush processes, especially the ones that cause
+// assertion errors
+// * bugs in the code that deletes obsolete files
+//
+// There are two parts of the test:
+// * write_stress, a binary that writes to the database
+// * write_stress_runner.py, a script that invokes and kills write_stress
+//
+// Here are some interesting parts of write_stress:
+// * Runs with very high concurrency of compactions and flushes (32 threads
+// total) and tries to create a huge amount of small files
+// * The keys written to the database are not uniformly distributed -- there is
+// a 3-character prefix that mutates occasionally (in prefix mutator thread), in
+// such a way that the first character mutates slower than second, which mutates
+// slower than third character. That way, the compaction stress tests some
+// interesting compaction features like trivial moves and bottommost level
+// calculation
+// * There is a thread that creates an iterator, holds it for couple of seconds
+// and then iterates over all keys. This is supposed to test RocksDB's abilities
+// to keep the files alive when there are references to them.
+// * Some writes trigger WAL sync. This is stress testing our WAL sync code.
+// * At the end of the run, we make sure that we didn't leak any of the sst
+// files
+//
+// write_stress_runner.py changes the mode in which we run write_stress and also
+// kills and restarts it. There are some interesting characteristics:
+// * At the beginning we divide the full test runtime into smaller parts --
+// shorter runtimes (couple of seconds) and longer runtimes (100, 1000) seconds
+// * The first time we run write_stress, we destroy the old DB. Every next time
+// during the test, we use the same DB.
+// * We can run in kill mode or clean-restart mode. Kill mode kills the
+// write_stress violently.
+// * We can run in mode where delete_obsolete_files_with_fullscan is true or
+// false
+// * We can run with low_open_files mode turned on or off. When it's turned on,
+// we configure table cache to only hold a couple of files -- that way we need
+// to reopen files every time we access them.
+//
+// Another goal was to create a stress test without a lot of parameters. So
+// tools/write_stress_runner.py should only take one parameter -- runtime_sec
+// and it should figure out everything else on its own.
+
+#include <cstdio>
+
+#ifndef GFLAGS
+int main() { // Built only when GFLAGS is not defined: reports the missing dependency and fails
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1; // non-zero exit status, nothing else is done in this build
+}
+#else
+
+#include <atomic>
+#include <cinttypes>
+#include <random>
+#include <set>
+#include <string>
+#include <thread>
+
+#include "file/filename.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DEFINE_int32(key_size, 10, "Key size"); // total key length; the first kPrefixSize characters are the shared prefix
+DEFINE_int32(value_size, 100, "Value size"); // length of the random value string written by WriteThread
+DEFINE_string(db, "", "Use the db with the following name."); // when empty, "<test directory>/write_stress" is used
+DEFINE_bool(destroy_db, true,
+ "Destroy the existing DB before running the test");
+
+DEFINE_int32(runtime_sec, 10 * 60, "How long are we running for, in seconds"); // -1 means run until the process is killed
+DEFINE_int32(seed, 139, "Random seed"); // seeds the generators of both the writer and the prefix mutator thread
+
+DEFINE_double(prefix_mutate_period_sec, 1.0,
+ "How often are we going to mutate the prefix");
+DEFINE_double(first_char_mutate_probability, 0.1,
+ "How likely are we to mutate the first char every period");
+DEFINE_double(second_char_mutate_probability, 0.2,
+ "How likely are we to mutate the second char every period");
+DEFINE_double(third_char_mutate_probability, 0.5,
+ "How likely are we to mutate the third char every period");
+
+DEFINE_int32(iterator_hold_sec, 5,
+ "How long will the iterator hold files before it gets destroyed");
+
+DEFINE_double(sync_probability, 0.01, "How often are we syncing writes"); // probability that a single Put uses WriteOptions::sync
+DEFINE_bool(delete_obsolete_files_with_fullscan, false,
+ "If true, we delete obsolete files after each compaction/flush "
+ "using GetChildren() API"); // implemented by setting delete_obsolete_files_period_micros to 0
+DEFINE_bool(low_open_files_mode, false,
+ "If true, we set max_open_files to 20, so that every file access "
+ "needs to reopen it"); // otherwise max_open_files is -1
+
+namespace ROCKSDB_NAMESPACE {
+
+// Number of leading key characters shared by all keys written at a given time.
+static const int kPrefixSize = 3;
+
+// Stress test driver. The constructor opens the DB named by FLAGS_db (after
+// destroying it when FLAGS_destroy_db is set) and aborts if that fails. Run()
+// starts one writer thread, one prefix mutator thread and one iterator-holding
+// thread, lets them run for FLAGS_runtime_sec seconds and finally checks that
+// the DB directory contains no table file that is not a live file.
+class WriteStress {
+ public:
+  WriteStress() : stop_(false) {
+    // initialize key_prefix
+    for (int i = 0; i < kPrefixSize; ++i) {
+      key_prefix_[i].store('a');
+    }
+
+    // Choose a location for the test database if none given with --db=<path>
+    if (FLAGS_db.empty()) {
+      std::string default_db_path;
+      Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/write_stress";
+      FLAGS_db = default_db_path;
+    }
+
+    Options options;
+    if (FLAGS_destroy_db) {
+      DestroyDB(FLAGS_db, options);  // ignore
+    }
+
+    // make the LSM tree deep, so that we have many concurrent flushes and
+    // compactions
+    options.create_if_missing = true;
+    options.write_buffer_size = 256 * 1024;              // 256k
+    options.max_bytes_for_level_base = 1 * 1024 * 1024;  // 1MB
+    options.target_file_size_base = 100 * 1024;          // 100k
+    options.max_write_buffer_number = 16;
+    options.max_background_compactions = 16;
+    options.max_background_flushes = 16;
+    options.max_open_files = FLAGS_low_open_files_mode ? 20 : -1;
+    if (FLAGS_delete_obsolete_files_with_fullscan) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+
+    // open DB
+    DB* db;
+    Status s = DB::Open(options, FLAGS_db, &db);
+    if (!s.ok()) {
+      fprintf(stderr, "Can't open database: %s\n", s.ToString().c_str());
+      std::abort();
+    }
+    db_.reset(db);
+  }
+
+  // Writes random key/value pairs until stop_ is set. Every key starts with
+  // the current prefix; a write is synced with probability
+  // FLAGS_sync_probability. Aborts the process when a Put fails.
+  void WriteThread() {
+    std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed));
+    std::uniform_real_distribution<double> dist(0, 1);
+
+    // Returns len random characters from 'a'..'z' (empty when len <= 0).
+    auto random_string = [](std::mt19937& r, int len) {
+      std::uniform_int_distribution<int> char_dist('a', 'z');
+      std::string ret;
+      for (int i = 0; i < len; ++i) {
+        ret += static_cast<char>(char_dist(r));
+      }
+      return ret;
+    };
+
+    while (!stop_.load(std::memory_order_relaxed)) {
+      std::string prefix;
+      prefix.resize(kPrefixSize);
+      for (int i = 0; i < kPrefixSize; ++i) {
+        prefix[i] = key_prefix_[i].load(std::memory_order_relaxed);
+      }
+      auto key = prefix + random_string(rng, FLAGS_key_size - kPrefixSize);
+      auto value = random_string(rng, FLAGS_value_size);
+      WriteOptions woptions;
+      woptions.sync = dist(rng) < FLAGS_sync_probability;
+      auto s = db_->Put(woptions, key, value);
+      if (!s.ok()) {
+        fprintf(stderr, "Write to DB failed: %s\n", s.ToString().c_str());
+        std::abort();
+      }
+    }
+  }
+
+  // Until stop_ is set: creates an iterator, holds it for
+  // FLAGS_iterator_hold_sec seconds, then scans all keys with it. Aborts the
+  // process when the iterator reports a non-OK status.
+  void IteratorHoldThread() {
+    while (!stop_.load(std::memory_order_relaxed)) {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+      Env::Default()->SleepForMicroseconds(FLAGS_iterator_hold_sec * 1000 *
+                                           1000LL);
+      for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      }
+      if (!iterator->status().ok()) {
+        fprintf(stderr, "Iterator statuts not OK: %s\n",
+                iterator->status().ToString().c_str());
+        std::abort();
+      }
+    }
+  }
+
+  // Until stop_ is set: every FLAGS_prefix_mutate_period_sec seconds, replaces
+  // each prefix character with a random one with its own probability flag.
+  void PrefixMutatorThread() {
+    std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed));
+    std::uniform_real_distribution<double> dist(0, 1);
+    std::uniform_int_distribution<int> char_dist('a', 'z');
+    while (!stop_.load(std::memory_order_relaxed)) {
+      Env::Default()->SleepForMicroseconds(
+          static_cast<int>(FLAGS_prefix_mutate_period_sec * 1000 * 1000LL));
+      if (dist(rng) < FLAGS_first_char_mutate_probability) {
+        key_prefix_[0].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+      if (dist(rng) < FLAGS_second_char_mutate_probability) {
+        key_prefix_[1].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+      if (dist(rng) < FLAGS_third_char_mutate_probability) {
+        key_prefix_[2].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+    }
+  }
+
+  // Runs the stress test. Never returns when FLAGS_runtime_sec is -1;
+  // otherwise returns 0 after the threads were joined and the leak check
+  // passed (the process is aborted when a leaked table file is found).
+  int Run() {
+    threads_.emplace_back([&]() { WriteThread(); });
+    threads_.emplace_back([&]() { PrefixMutatorThread(); });
+    threads_.emplace_back([&]() { IteratorHoldThread(); });
+
+    if (FLAGS_runtime_sec == -1) {
+      // infinite runtime, until we get killed
+      while (true) {
+        Env::Default()->SleepForMicroseconds(1000 * 1000);
+      }
+    }
+
+    // Sleep one second at a time: FLAGS_runtime_sec * 1000 * 1000 is an int
+    // expression and overflows for runtimes above roughly 2147 seconds.
+    for (int elapsed_sec = 0; elapsed_sec < FLAGS_runtime_sec; ++elapsed_sec) {
+      Env::Default()->SleepForMicroseconds(1000 * 1000);
+    }
+
+    stop_.store(true, std::memory_order_relaxed);
+    for (auto& t : threads_) {
+      t.join();
+    }
+    threads_.clear();
+
+// Skip checking for leaked files in ROCKSDB_LITE since we don't have access to
+// function GetLiveFilesMetaData
+#ifndef ROCKSDB_LITE
+    // let's see if we leaked some files
+    db_->PauseBackgroundWork();
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    std::set<uint64_t> sst_file_numbers;
+    for (const auto& file : metadata) {
+      uint64_t number;
+      FileType type;
+      if (!ParseFileName(file.name, &number, "LOG", &type)) {
+        continue;
+      }
+      if (type == kTableFile) {
+        sst_file_numbers.insert(number);
+      }
+    }
+
+    // Every table file found in the DB directory must be one of the live
+    // files collected above.
+    std::vector<std::string> children;
+    Env::Default()->GetChildren(FLAGS_db, &children);
+    for (const auto& child : children) {
+      uint64_t number;
+      FileType type;
+      if (!ParseFileName(child, &number, "LOG", &type)) {
+        continue;
+      }
+      if (type == kTableFile) {
+        if (sst_file_numbers.find(number) == sst_file_numbers.end()) {
+          fprintf(stderr,
+                  "Found a table file in DB path that should have been "
+                  "deleted: %s\n",
+                  child.c_str());
+          std::abort();
+        }
+      }
+    }
+    db_->ContinueBackgroundWork();
+#endif  // !ROCKSDB_LITE
+
+    return 0;
+  }
+
+ private:
+  // each key is prepended with this prefix. we occasionally change it. third
+  // letter is changed more frequently than second, which is changed more
+  // frequently than the first one.
+  std::atomic<char> key_prefix_[kPrefixSize];
+  // Set by Run() to ask the worker threads to leave their loops.
+  std::atomic<bool> stop_;
+  std::vector<port::Thread> threads_;
+  std::unique_ptr<DB> db_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) { // Parses the gflags command line, builds a WriteStress and returns the result of Run()
+ SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+ " [OPTIONS]...");
+ ParseCommandLineFlags(&argc, &argv, true);
+ ROCKSDB_NAMESPACE::WriteStress write_stress; // the constructor opens the DB and aborts the process if that fails
+ return write_stress.Run();
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/tools/write_stress_runner.py b/src/rocksdb/tools/write_stress_runner.py
new file mode 100644
index 000000000..fc0c99c23
--- /dev/null
+++ b/src/rocksdb/tools/write_stress_runner.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python2
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import subprocess
+import argparse
+import random
+import time
+import sys
+
+
def generate_runtimes(total_runtime):
    """Split ``total_runtime`` seconds into a random list of run lengths.

    Durations are drawn at random from a pool that favours short runs and
    are capped so that the returned values sum to exactly
    ``total_runtime``.

    Args:
        total_runtime: total number of seconds to distribute (an int).

    Returns:
        A list of positive ints whose sum equals ``total_runtime``; an
        empty list when ``total_runtime`` is zero or negative.
    """
    # combination of short runtimes and long runtimes, with heavier
    # weight on short runtimes (1-9 appear twice, 10-19 once, plus the two
    # long outliers).
    # list(...) is required on Python 3, where range objects cannot be
    # concatenated with `+`; on Python 2 it is a harmless copy.
    possible_runtimes_sec = (
        list(range(1, 10)) + list(range(1, 20)) + [100, 1000])
    runtimes = []
    while total_runtime > 0:
        # Never overshoot the remaining budget.
        chosen = min(random.choice(possible_runtimes_sec), total_runtime)
        runtimes.append(chosen)
        total_runtime -= chosen
    return runtimes
+
+
def main(args):
    """Run ``./write_stress`` repeatedly, randomly killing it mid-run.

    The total budget ``args.runtime_sec`` is split into slices by
    ``generate_runtimes``. For each slice the binary is started either in
    clean-shutdown mode (it is given the slice length and expected to exit
    with status 0 on its own) or in kill mode (it is started with
    ``--runtime_sec=-1`` and killed by this script once the slice has
    elapsed). Every run after the first reuses the existing database.

    Args:
        args: parsed command-line namespace providing ``runtime_sec``
            (anything accepted by ``int()``) and ``db`` (a path string,
            empty to leave the binary's default in place).

    Exits the process with status 1 as soon as a ``write_stress`` run
    terminates on its own with a non-zero exit code.
    """
    runtimes = generate_runtimes(int(args.runtime_sec))
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print("Going to execute write stress for " + str(runtimes))
    first_time = True

    for runtime in runtimes:
        kill = random.choice([False, True])

        # The command is kept as an argv list and executed without a shell,
        # so a --db path containing spaces or shell metacharacters is passed
        # through verbatim and kill() targets write_stress itself rather
        # than an intermediate shell process.
        # NOTE(review): -1 presumably means "run until killed" -- confirm
        # against write_stress.cc.
        cmd = ['./write_stress',
               '--runtime_sec=' + ("-1" if kill else str(runtime))]

        if args.db:
            cmd.append('--db=' + args.db)

        if first_time:
            first_time = False
        else:
            # use current db
            cmd.append('--destroy_db=false')
        if random.choice([False, True]):
            cmd.append('--delete_obsolete_files_with_fullscan=true')
        if random.choice([False, True]):
            cmd.append('--low_open_files_mode=true')

        print("Running write_stress for %d seconds (%s): %s" %
              (runtime, ("kill-mode" if kill else "clean-shutdown-mode"),
               ' '.join(cmd)))

        child = subprocess.Popen(cmd)
        killtime = time.time() + runtime
        # Clean-shutdown mode polls until the child exits; kill mode polls
        # only until the deadline.
        while not kill or time.time() < killtime:
            time.sleep(1)
            if child.poll() is not None:
                if child.returncode == 0:
                    break
                else:
                    print("ERROR: write_stress died with exitcode=%d\n"
                          % child.returncode)
                    sys.exit(1)
        if kill and child.poll() is None:
            child.kill()
            # Reap the killed child so it does not linger as a zombie while
            # the next run starts.
            child.wait()
        # breathe
        time.sleep(3)
+
if __name__ == '__main__':
    random.seed(time.time())
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the following line's indentation
    # in the description text.
    parser = argparse.ArgumentParser(
        description="This script runs and kills "
                    "write_stress multiple times")
    parser.add_argument("--runtime_sec", default='1000',
                        help="total number of seconds to spend running "
                             "write_stress (default: %(default)s)")
    parser.add_argument("--db", default='',
                        help="database path passed to write_stress as "
                             "--db; not passed when empty")
    args = parser.parse_args()
    main(args)