diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/tools | |
parent | Initial commit. (diff) | |
download | ceph-b26c4052f3542036551aa9dec9caa4226e456195.tar.xz ceph-b26c4052f3542036551aa9dec9caa4226e456195.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
95 files changed, 39204 insertions, 0 deletions
diff --git a/src/rocksdb/tools/CMakeLists.txt b/src/rocksdb/tools/CMakeLists.txt new file mode 100644 index 000000000..19030e84b --- /dev/null +++ b/src/rocksdb/tools/CMakeLists.txt @@ -0,0 +1,30 @@ +set(CORE_TOOLS + sst_dump.cc + ldb.cc) +foreach(src ${CORE_TOOLS}) + get_filename_component(exename ${src} NAME_WE) + add_executable(${exename}${ARTIFACT_SUFFIX} + ${src}) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB}) + list(APPEND core_tool_deps ${exename}) +endforeach() + +if(WITH_TOOLS) + set(TOOLS + db_sanity_test.cc + write_stress.cc + db_repl_stress.cc + dump/rocksdb_dump.cc + dump/rocksdb_undump.cc) + foreach(src ${TOOLS}) + get_filename_component(exename ${src} NAME_WE) + add_executable(${exename}${ARTIFACT_SUFFIX} + ${src}) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) + list(APPEND tool_deps ${exename}) + endforeach() + + add_custom_target(ldb_tests + COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/ldb_tests.py + DEPENDS ldb) +endif() diff --git a/src/rocksdb/tools/Dockerfile b/src/rocksdb/tools/Dockerfile new file mode 100644 index 000000000..1d5ead7fd --- /dev/null +++ b/src/rocksdb/tools/Dockerfile @@ -0,0 +1,5 @@ +FROM buildpack-deps:wheezy + +ADD ./ldb /rocksdb/tools/ldb + +CMD /rocksdb/tools/ldb diff --git a/src/rocksdb/tools/advisor/README.md b/src/rocksdb/tools/advisor/README.md new file mode 100644 index 000000000..b02d7ec50 --- /dev/null +++ b/src/rocksdb/tools/advisor/README.md @@ -0,0 +1,96 @@ +# Rocksdb Tuning Advisor + +## Motivation + +The performance of Rocksdb is contingent on its tuning. However, +because of the complexity of its underlying technology and a large number of +configurable parameters, a good configuration is sometimes hard to obtain. The aim of +the python command-line tool, Rocksdb Advisor, is to automate the process of +suggesting improvements in the configuration based on advice from Rocksdb +experts. 
+ +## Overview + +Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer +[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)). +Users provide the Rocksdb configuration that they want to improve upon (as the +familiar Rocksdb OPTIONS file — +[example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini)) +and the path of the file which contains Rocksdb logs and statistics. +The [Advisor](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser_example.py) +creates appropriate DataSource objects (for Rocksdb +[logs](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser.py). +The Rules uses rules from experts to parse data-sources and trigger appropriate rules. +The Advisor's output gives information about which rules were triggered, +why they were triggered and what each of them suggests. Each suggestion +provided by a triggered rule advises some action on a Rocksdb +configuration option, for example, increase CFOptions.write_buffer_size, +set bloom_bits to 2 etc. 
+ +## Usage + +### Prerequisites +The tool needs the following to run: +* python3 + +### Running the tool +An example command to run the tool: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20 +``` + +### Command-line arguments + +Most important amongst all the input that the Advisor needs, are the rules +spec and starting Rocksdb configuration. The configuration is provided as the +familiar Rocksdb Options file (refer [example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini)). +The Rules spec is written in the INI format (more details in +[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)). + +In brief, a Rule is made of conditions and is triggered when all its +constituent conditions are triggered. When triggered, a Rule suggests changes +(increase/decrease/set to a suggested value) to certain Rocksdb options that +aim to improve Rocksdb performance. Every Condition has a 'source' i.e. +the data source that would be checked for triggering that condition. +For example, a log Condition (with 'source=LOG') is triggered if a particular +'regex' is found in the Rocksdb LOG files. As of now the Rules Engine +supports 3 types of Conditions (and consequently data-sources): +LOG, OPTIONS, TIME_SERIES. The TIME_SERIES data can be sourced from the +Rocksdb [statistics](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h) +or [perf context](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/perf_context.h). 
+ +For more information about the remaining command-line arguments, run: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --help +``` + +### Sample output + +Here, a Rocksdb log-based rule has been triggered: + +```shell +Rule: stall-too-many-memtables +LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ +Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2'] +Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase +scope: col_fam: +{'default'} +``` + +## Running the tests + +Tests for the code have been added to the +[test/](https://github.com/facebook/rocksdb/tree/main/tools/advisor/test) +directory. For example, to run the unit tests for db_log_parser.py: + +```shell +cd rocksdb/tools/advisor +python3 -m unittest -v test.test_db_log_parser +``` diff --git a/src/rocksdb/tools/advisor/advisor/__init__.py b/src/rocksdb/tools/advisor/advisor/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/src/rocksdb/tools/advisor/advisor/__init__.py diff --git a/src/rocksdb/tools/advisor/advisor/bench_runner.py b/src/rocksdb/tools/advisor/advisor/bench_runner.py new file mode 100644 index 000000000..45d6c8313 --- /dev/null +++ b/src/rocksdb/tools/advisor/advisor/bench_runner.py @@ -0,0 +1,39 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). 
import re
from abc import ABC, abstractmethod


class BenchmarkRunner(ABC):
    """Abstract interface for running a benchmark against a Rocksdb setup.

    A concrete runner executes an experiment and reports data sources plus a
    performance metric; the optimizer compares metrics across iterations via
    is_metric_better().
    """

    @staticmethod
    @abstractmethod
    def is_metric_better(new_metric, old_metric):
        # Return True when new_metric is at least as good as old_metric;
        # what "better" means is defined by each concrete runner.
        pass

    @abstractmethod
    def run_experiment(self):
        # should return a list of DataSource objects
        pass

    @staticmethod
    def get_info_log_file_name(log_dir, db_path):
        # Example: DB Path = /dev/shm and OPTIONS file has option
        # db_log_dir=/tmp/rocks/, then the name of the log file will be
        # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is
        # not specified in the OPTIONS file, then the location of the log file
        # will be /dev/shm and the name of the file will be 'LOG'
        file_name = ""
        if log_dir:
            # refer GetInfoLogPrefix() in rocksdb/util/filename.cc
            # example db_path: /dev/shm/dbbench
            file_name = db_path[1:]  # to ignore the leading '/' character
            # Fix: raw string so '\-' and '\.' are not invalid escape
            # sequences (DeprecationWarning today, SyntaxError in future
            # Python); '-' is placed last in the class so it needs no escape.
            to_be_replaced = re.compile(r"[^0-9a-zA-Z_.-]")
            for character in to_be_replaced.findall(db_path):
                file_name = file_name.replace(character, "_")
            if not file_name.endswith("_"):
                file_name += "_"
        file_name += "LOG"
        return file_name


# --- scraped-page residue: header of the next file in the original diff ---
# diff --git a/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
#          b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
# new file mode 100644 index 000000000..40e2bb953
# --- /dev/null
# +++ b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
# @@ -0,0 +1,140 @@
# +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# +# This source code is licensed under both the GPLv2 (found in the
# +# COPYING file in the root directory) and Apache 2.0 License
# +# (found in the LICENSE.Apache file in the root directory).
+ +import argparse + +from advisor.db_config_optimizer import ConfigOptimizer +from advisor.db_log_parser import NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import RulesSpec + + +CONFIG_OPT_NUM_ITER = 10 + + +def main(args): + # initialise the RulesSpec parser + rule_spec_parser = RulesSpec(args.rules_spec) + # initialise the benchmark runner + bench_runner_module = __import__( + args.benchrunner_module, fromlist=[args.benchrunner_class] + ) + bench_runner_class = getattr(bench_runner_module, args.benchrunner_class) + ods_args = {} + if args.ods_client and args.ods_entity: + ods_args["client_script"] = args.ods_client + ods_args["entity"] = args.ods_entity + if args.ods_key_prefix: + ods_args["key_prefix"] = args.ods_key_prefix + db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args) + # initialise the database configuration + db_options = DatabaseOptions(args.rocksdb_options, args.misc_options) + # set the frequency at which stats are dumped in the LOG file and the + # location of the LOG file. 
+ db_log_dump_settings = { + "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: args.stats_dump_period_sec} + } + db_options.update_options(db_log_dump_settings) + # initialise the configuration optimizer + config_optimizer = ConfigOptimizer( + db_bench_runner, db_options, rule_spec_parser, args.base_db_path + ) + # run the optimiser to improve the database configuration for given + # benchmarks, with the help of expert-specified rules + final_db_options = config_optimizer.run() + # generate the final rocksdb options file + print( + "Final configuration in: " + final_db_options.generate_options_config("final") + ) + print("Final miscellaneous options: " + repr(final_db_options.get_misc_options())) + + +if __name__ == "__main__": + """ + An example run of this tool from the command-line would look like: + python3 -m advisor.config_optimizer_example + --base_db_path=/tmp/rocksdbtest-155919/dbbench + --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2 + --rules_spec=advisor/rules.ini --stats_dump_period_sec=20 + --benchrunner_module=advisor.db_bench_runner + --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench + readwhilewriting use_existing_db=true duration=90 + """ + parser = argparse.ArgumentParser( + description="This script is used for\ + searching for a better database configuration" + ) + parser.add_argument( + "--rocksdb_options", + required=True, + type=str, + help="path of the starting Rocksdb OPTIONS file", + ) + # these are options that are column-family agnostic and are not yet + # supported by the Rocksdb Options file: eg. bloom_bits=2 + parser.add_argument( + "--misc_options", + nargs="*", + help="whitespace-separated list of options that are not supported " + + "by the Rocksdb OPTIONS file, given in the " + + '<option_name>=<option_value> format eg. 
"bloom_bits=2 ' + + 'rate_limiter_bytes_per_sec=128000000"', + ) + parser.add_argument( + "--base_db_path", required=True, type=str, help="path for the Rocksdb database" + ) + parser.add_argument( + "--rules_spec", + required=True, + type=str, + help="path of the file containing the expert-specified Rules", + ) + parser.add_argument( + "--stats_dump_period_sec", + required=True, + type=int, + help="the frequency (in seconds) at which STATISTICS are printed to " + + "the Rocksdb LOG file", + ) + # ODS arguments + parser.add_argument("--ods_client", type=str, help="the ODS client binary") + parser.add_argument( + "--ods_entity", + type=str, + help="the servers for which the ODS stats need to be fetched", + ) + parser.add_argument( + "--ods_key_prefix", + type=str, + help="the prefix that needs to be attached to the keys of time " + + "series to be fetched from ODS", + ) + # benchrunner_module example: advisor.db_benchmark_client + parser.add_argument( + "--benchrunner_module", + required=True, + type=str, + help="the module containing the BenchmarkRunner class to be used by " + + "the Optimizer, example: advisor.db_bench_runner", + ) + # benchrunner_class example: DBBenchRunner + parser.add_argument( + "--benchrunner_class", + required=True, + type=str, + help="the name of the BenchmarkRunner class to be used by the " + + "Optimizer, should be present in the module provided in the " + + "benchrunner_module argument, example: DBBenchRunner", + ) + parser.add_argument( + "--benchrunner_pos_args", + nargs="*", + help="whitespace-separated positional arguments that are passed on " + + "to the constructor of the BenchmarkRunner class provided in the " + + 'benchrunner_class argument, example: "use_existing_db=true ' + + 'duration=900"', + ) + args = parser.parse_args() + main(args) diff --git a/src/rocksdb/tools/advisor/advisor/db_bench_runner.py b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py new file mode 100644 index 000000000..f5802ed15 --- /dev/null +++ 
b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py @@ -0,0 +1,237 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import shutil +import subprocess +import time + +from advisor.bench_runner import BenchmarkRunner +from advisor.db_log_parser import DatabaseLogs, DataSource, NO_COL_FAMILY +from advisor.db_stats_fetcher import ( + DatabasePerfContext, + LogStatsParser, + OdsStatsFetcher, +) + + +""" +NOTE: This is not thread-safe, because the output file is simply overwritten. +""" + + +class DBBenchRunner(BenchmarkRunner): + OUTPUT_FILE = "temp/dbbench_out.tmp" + ERROR_FILE = "temp/dbbench_err.tmp" + DB_PATH = "DB path" + THROUGHPUT = "ops/sec" + PERF_CON = " PERF_CONTEXT:" + + @staticmethod + def is_metric_better(new_metric, old_metric): + # for db_bench 'throughput' is the metric returned by run_experiment + return new_metric >= old_metric + + @staticmethod + def get_opt_args_str(misc_options_dict): + # given a dictionary of options and their values, return a string + # that can be appended as command-line arguments + optional_args_str = "" + for option_name, option_value in misc_options_dict.items(): + if option_value: + optional_args_str += " --" + option_name + "=" + str(option_value) + return optional_args_str + + def __init__(self, positional_args, ods_args=None): + # parse positional_args list appropriately + self.db_bench_binary = positional_args[0] + self.benchmark = positional_args[1] + self.db_bench_args = None + if len(positional_args) > 2: + # options list with each option given as "<option>=<value>" + self.db_bench_args = positional_args[2:] + # save ods_args, if provided + self.ods_args = ods_args + + def _parse_output(self, get_perf_context=False): + """ + Sample db_bench output after running 'readwhilewriting' benchmark: + DB path: 
[/tmp/rocksdbtest-155919/dbbench]\n + readwhilewriting : 16.582 micros/op 60305 ops/sec; 4.2 MB/s (3433828\ + of 5427999 found)\n + PERF_CONTEXT:\n + user_key_comparison_count = 500466712, block_cache_hit_count = ...\n + """ + output = {self.THROUGHPUT: None, self.DB_PATH: None, self.PERF_CON: None} + perf_context_begins = False + with open(self.OUTPUT_FILE, "r") as fp: + for line in fp: + if line.startswith(self.benchmark): + # line from sample output: + # readwhilewriting : 16.582 micros/op 60305 ops/sec; \ + # 4.2 MB/s (3433828 of 5427999 found)\n + print(line) # print output of the benchmark run + token_list = line.strip().split() + for ix, token in enumerate(token_list): + if token.startswith(self.THROUGHPUT): + # in above example, throughput = 60305 ops/sec + output[self.THROUGHPUT] = float(token_list[ix - 1]) + break + elif get_perf_context and line.startswith(self.PERF_CON): + # the following lines in the output contain perf context + # statistics (refer example above) + perf_context_begins = True + elif get_perf_context and perf_context_begins: + # Sample perf_context output: + # user_key_comparison_count = 500, block_cache_hit_count =\ + # 468, block_read_count = 580, block_read_byte = 445, ... + token_list = line.strip().split(",") + # token_list = ['user_key_comparison_count = 500', + # 'block_cache_hit_count = 468','block_read_count = 580'... 
+ perf_context = { + tk.split("=")[0].strip(): tk.split("=")[1].strip() + for tk in token_list + if tk + } + # TODO(poojam23): this is a hack and should be replaced + # with the timestamp that db_bench will provide per printed + # perf_context + timestamp = int(time.time()) + perf_context_ts = {} + for stat in perf_context.keys(): + perf_context_ts[stat] = {timestamp: int(perf_context[stat])} + output[self.PERF_CON] = perf_context_ts + perf_context_begins = False + elif line.startswith(self.DB_PATH): + # line from sample output: + # DB path: [/tmp/rocksdbtest-155919/dbbench]\n + output[self.DB_PATH] = line.split("[")[1].split("]")[0] + return output + + def get_log_options(self, db_options, db_path): + # get the location of the LOG file and the frequency at which stats are + # dumped in the LOG file + log_dir_path = None + stats_freq_sec = None + logs_file_prefix = None + + # fetch frequency at which the stats are dumped in the Rocksdb logs + dump_period = "DBOptions.stats_dump_period_sec" + # fetch the directory, if specified, in which the Rocksdb logs are + # dumped, by default logs are dumped in same location as database + log_dir = "DBOptions.db_log_dir" + log_options = db_options.get_options([dump_period, log_dir]) + if dump_period in log_options: + stats_freq_sec = int(log_options[dump_period][NO_COL_FAMILY]) + if log_dir in log_options: + log_dir_path = log_options[log_dir][NO_COL_FAMILY] + + log_file_name = DBBenchRunner.get_info_log_file_name(log_dir_path, db_path) + + if not log_dir_path: + log_dir_path = db_path + if not log_dir_path.endswith("/"): + log_dir_path += "/" + + logs_file_prefix = log_dir_path + log_file_name + return (logs_file_prefix, stats_freq_sec) + + def _get_options_command_line_args_str(self, curr_options): + """ + This method uses the provided Rocksdb OPTIONS to create a string of + command-line arguments for db_bench. 
+ The --options_file argument is always given and the options that are + not supported by the OPTIONS file are given as separate arguments. + """ + optional_args_str = DBBenchRunner.get_opt_args_str( + curr_options.get_misc_options() + ) + # generate an options configuration file + options_file = curr_options.generate_options_config(nonce="12345") + optional_args_str += " --options_file=" + options_file + return optional_args_str + + def _setup_db_before_experiment(self, curr_options, db_path): + # remove destination directory if it already exists + try: + shutil.rmtree(db_path, ignore_errors=True) + except OSError as e: + print("Error: rmdir " + e.filename + " " + e.strerror) + # setup database with a million keys using the fillrandom benchmark + command = "%s --benchmarks=fillrandom --db=%s --num=1000000" % ( + self.db_bench_binary, + db_path, + ) + args_str = self._get_options_command_line_args_str(curr_options) + command += args_str + self._run_command(command) + + def _build_experiment_command(self, curr_options, db_path): + command = "%s --benchmarks=%s --statistics --perf_level=3 --db=%s" % ( + self.db_bench_binary, + self.benchmark, + db_path, + ) + # fetch the command-line arguments string for providing Rocksdb options + args_str = self._get_options_command_line_args_str(curr_options) + # handle the command-line args passed in the constructor, these + # arguments are specific to db_bench + for cmd_line_arg in self.db_bench_args: + args_str += " --" + cmd_line_arg + command += args_str + return command + + def _run_command(self, command): + out_file = open(self.OUTPUT_FILE, "w+") + err_file = open(self.ERROR_FILE, "w+") + print("executing... 
- " + command) + subprocess.call(command, shell=True, stdout=out_file, stderr=err_file) + out_file.close() + err_file.close() + + def run_experiment(self, db_options, db_path): + # setup the Rocksdb database before running experiment + self._setup_db_before_experiment(db_options, db_path) + # get the command to run the experiment + command = self._build_experiment_command(db_options, db_path) + experiment_start_time = int(time.time()) + # run experiment + self._run_command(command) + experiment_end_time = int(time.time()) + # parse the db_bench experiment output + parsed_output = self._parse_output(get_perf_context=True) + + # get the log files path prefix and frequency at which Rocksdb stats + # are dumped in the logs + logs_file_prefix, stats_freq_sec = self.get_log_options( + db_options, parsed_output[self.DB_PATH] + ) + # create the Rocksbd LOGS object + db_logs = DatabaseLogs(logs_file_prefix, db_options.get_column_families()) + # Create the Log STATS object + db_log_stats = LogStatsParser(logs_file_prefix, stats_freq_sec) + # Create the PerfContext STATS object + db_perf_context = DatabasePerfContext(parsed_output[self.PERF_CON], 0, False) + # create the data-sources dictionary + data_sources = { + DataSource.Type.DB_OPTIONS: [db_options], + DataSource.Type.LOG: [db_logs], + DataSource.Type.TIME_SERIES: [db_log_stats, db_perf_context], + } + # Create the ODS STATS object + if self.ods_args: + key_prefix = "" + if "key_prefix" in self.ods_args: + key_prefix = self.ods_args["key_prefix"] + data_sources[DataSource.Type.TIME_SERIES].append( + OdsStatsFetcher( + self.ods_args["client_script"], + self.ods_args["entity"], + experiment_start_time, + experiment_end_time, + key_prefix, + ) + ) + # return the experiment's data-sources and throughput + return data_sources, parsed_output[self.THROUGHPUT] diff --git a/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py new file mode 100644 index 
000000000..413778478 --- /dev/null +++ b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py @@ -0,0 +1,293 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import copy +import random + +from advisor.db_log_parser import NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import Suggestion + + +class ConfigOptimizer: + SCOPE = "scope" + SUGG_VAL = "suggested values" + + @staticmethod + def apply_action_on_value(old_value, action, suggested_values): + chosen_sugg_val = None + if suggested_values: + chosen_sugg_val = random.choice(list(suggested_values)) + new_value = None + if action is Suggestion.Action.set or not old_value: + assert chosen_sugg_val + new_value = chosen_sugg_val + else: + # For increase/decrease actions, currently the code tries to make + # a 30% change in the option's value per iteration. 
An addend is + # also present (+1 or -1) to handle the cases when the option's + # old value was 0 or the final int() conversion suppressed the 30% + # change made to the option + old_value = float(old_value) + mul = 0 + add = 0 + if action is Suggestion.Action.increase: + if old_value < 0: + mul = 0.7 + add = 2 + else: + mul = 1.3 + add = 2 + elif action is Suggestion.Action.decrease: + if old_value < 0: + mul = 1.3 + add = -2 + else: + mul = 0.7 + add = -2 + new_value = int(old_value * mul + add) + return new_value + + @staticmethod + def improve_db_config(options, rule, suggestions_dict): + # this method takes ONE 'rule' and applies all its suggestions on the + # appropriate options + required_options = [] + rule_suggestions = [] + for sugg_name in rule.get_suggestions(): + option = suggestions_dict[sugg_name].option + action = suggestions_dict[sugg_name].action + # A Suggestion in the rules spec must have the 'option' and + # 'action' fields defined, always call perform_checks() method + # after parsing the rules file using RulesSpec + assert option + assert action + required_options.append(option) + rule_suggestions.append(suggestions_dict[sugg_name]) + current_config = options.get_options(required_options) + # Create the updated configuration from the rule's suggestions + updated_config = {} + for sugg in rule_suggestions: + # case: when the option is not present in the current configuration + if sugg.option not in current_config: + try: + new_value = ConfigOptimizer.apply_action_on_value( + None, sugg.action, sugg.suggested_values + ) + if sugg.option not in updated_config: + updated_config[sugg.option] = {} + if DatabaseOptions.is_misc_option(sugg.option): + # this suggestion is on an option that is not yet + # supported by the Rocksdb OPTIONS file and so it is + # not prefixed by a section type. 
+ updated_config[sugg.option][NO_COL_FAMILY] = new_value + else: + for col_fam in rule.get_trigger_column_families(): + updated_config[sugg.option][col_fam] = new_value + except AssertionError: + print( + "WARNING(ConfigOptimizer): provide suggested_values " + + "for " + + sugg.option + ) + continue + # case: when the option is present in the current configuration + if NO_COL_FAMILY in current_config[sugg.option]: + old_value = current_config[sugg.option][NO_COL_FAMILY] + try: + new_value = ConfigOptimizer.apply_action_on_value( + old_value, sugg.action, sugg.suggested_values + ) + if sugg.option not in updated_config: + updated_config[sugg.option] = {} + updated_config[sugg.option][NO_COL_FAMILY] = new_value + except AssertionError: + print( + "WARNING(ConfigOptimizer): provide suggested_values " + + "for " + + sugg.option + ) + else: + for col_fam in rule.get_trigger_column_families(): + old_value = None + if col_fam in current_config[sugg.option]: + old_value = current_config[sugg.option][col_fam] + try: + new_value = ConfigOptimizer.apply_action_on_value( + old_value, sugg.action, sugg.suggested_values + ) + if sugg.option not in updated_config: + updated_config[sugg.option] = {} + updated_config[sugg.option][col_fam] = new_value + except AssertionError: + print( + "WARNING(ConfigOptimizer): provide " + + "suggested_values for " + + sugg.option + ) + return current_config, updated_config + + @staticmethod + def pick_rule_to_apply(rules, last_rule_name, rules_tried, backtrack): + if not rules: + print("\nNo more rules triggered!") + return None + # if the last rule provided an improvement in the database performance, + # and it was triggered again (i.e. it is present in 'rules'), then pick + # the same rule for this iteration too. 
+ if last_rule_name and not backtrack: + for rule in rules: + if rule.name == last_rule_name: + return rule + # there was no previous rule OR the previous rule did not improve db + # performance OR it was not triggered for this iteration, + # then pick another rule that has not been tried yet + for rule in rules: + if rule.name not in rules_tried: + return rule + print("\nAll rules have been exhausted") + return None + + @staticmethod + def apply_suggestions( + triggered_rules, + current_rule_name, + rules_tried, + backtrack, + curr_options, + suggestions_dict, + ): + curr_rule = ConfigOptimizer.pick_rule_to_apply( + triggered_rules, current_rule_name, rules_tried, backtrack + ) + if not curr_rule: + return tuple([None] * 4) + # if a rule has been picked for improving db_config, update rules_tried + rules_tried.add(curr_rule.name) + # get updated config based on the picked rule + curr_conf, updated_conf = ConfigOptimizer.improve_db_config( + curr_options, curr_rule, suggestions_dict + ) + conf_diff = DatabaseOptions.get_options_diff(curr_conf, updated_conf) + if not conf_diff: # the current and updated configs are the same + ( + curr_rule, + rules_tried, + curr_conf, + updated_conf, + ) = ConfigOptimizer.apply_suggestions( + triggered_rules, + None, + rules_tried, + backtrack, + curr_options, + suggestions_dict, + ) + print("returning from apply_suggestions") + return (curr_rule, rules_tried, curr_conf, updated_conf) + + # TODO(poojam23): check if this method is required or can we directly set + # the config equal to the curr_config + @staticmethod + def get_backtrack_config(curr_config, updated_config): + diff = DatabaseOptions.get_options_diff(curr_config, updated_config) + bt_config = {} + for option in diff: + bt_config[option] = {} + for col_fam in diff[option]: + bt_config[option][col_fam] = diff[option][col_fam][0] + print(bt_config) + return bt_config + + def __init__(self, bench_runner, db_options, rule_parser, base_db): + self.bench_runner = bench_runner 
+ self.db_options = db_options + self.rule_parser = rule_parser + self.base_db_path = base_db + + def run(self): + # In every iteration of this method's optimization loop we pick ONE + # RULE from all the triggered rules and apply all its suggestions to + # the appropriate options. + # bootstrapping the optimizer + print("Bootstrapping optimizer:") + options = copy.deepcopy(self.db_options) + old_data_sources, old_metric = self.bench_runner.run_experiment( + options, self.base_db_path + ) + print("Initial metric: " + str(old_metric)) + self.rule_parser.load_rules_from_spec() + self.rule_parser.perform_section_checks() + triggered_rules = self.rule_parser.get_triggered_rules( + old_data_sources, options.get_column_families() + ) + print("\nTriggered:") + self.rule_parser.print_rules(triggered_rules) + backtrack = False + rules_tried = set() + ( + curr_rule, + rules_tried, + curr_conf, + updated_conf, + ) = ConfigOptimizer.apply_suggestions( + triggered_rules, + None, + rules_tried, + backtrack, + options, + self.rule_parser.get_suggestions_dict(), + ) + # the optimizer loop + while curr_rule: + print("\nRule picked for next iteration:") + print(curr_rule.name) + print("\ncurrent config:") + print(curr_conf) + print("updated config:") + print(updated_conf) + options.update_options(updated_conf) + # run bench_runner with updated config + new_data_sources, new_metric = self.bench_runner.run_experiment( + options, self.base_db_path + ) + print("\nnew metric: " + str(new_metric)) + backtrack = not self.bench_runner.is_metric_better(new_metric, old_metric) + # update triggered_rules, metric, data_sources, if required + if backtrack: + # revert changes to options config + print("\nBacktracking to previous configuration") + backtrack_conf = ConfigOptimizer.get_backtrack_config( + curr_conf, updated_conf + ) + options.update_options(backtrack_conf) + else: + # run advisor on new data sources + self.rule_parser.load_rules_from_spec() # reboot the advisor + 
self.rule_parser.perform_section_checks() + triggered_rules = self.rule_parser.get_triggered_rules( + new_data_sources, options.get_column_families() + ) + print("\nTriggered:") + self.rule_parser.print_rules(triggered_rules) + old_metric = new_metric + old_data_sources = new_data_sources + rules_tried = set() + # pick rule to work on and set curr_rule to that + ( + curr_rule, + rules_tried, + curr_conf, + updated_conf, + ) = ConfigOptimizer.apply_suggestions( + triggered_rules, + curr_rule.name, + rules_tried, + backtrack, + options, + self.rule_parser.get_suggestions_dict(), + ) + # return the final database options configuration + return options diff --git a/src/rocksdb/tools/advisor/advisor/db_log_parser.py b/src/rocksdb/tools/advisor/advisor/db_log_parser.py new file mode 100644 index 000000000..9ba541fc3 --- /dev/null +++ b/src/rocksdb/tools/advisor/advisor/db_log_parser.py @@ -0,0 +1,134 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import glob +import re +import time +from abc import ABC, abstractmethod +from calendar import timegm +from enum import Enum + + +NO_COL_FAMILY = "DB_WIDE" + + +class DataSource(ABC): + class Type(Enum): + LOG = 1 + DB_OPTIONS = 2 + TIME_SERIES = 3 + + def __init__(self, type): + self.type = type + + @abstractmethod + def check_and_trigger_conditions(self, conditions): + pass + + +class Log: + @staticmethod + def is_new_log(log_line): + # The assumption is that a new log will start with a date printed in + # the below regex format. 
class Log:
    """One entry parsed from a Rocksdb LOG file.

    A log line looks like:
        '2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
         [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00'
    i.e. timestamp, thread context, then the free-form message. If the message
    mentions a known column family as '[<cf_name>]', the entry is attributed to
    that column family, otherwise to NO_COL_FAMILY (a database-wide entry).
    """

    @staticmethod
    def is_new_log(log_line):
        """Return a truthy match iff log_line starts a new log entry.

        The assumption is that a new log will start with a date printed in
        the below regex format; continuation lines of a multi-line message
        do not.
        """
        # Raw string: avoids invalid escape sequences like '\d' in a normal
        # string literal (previously silenced with '# noqa').
        date_regex = r"\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}"
        return re.match(date_regex, log_line)

    def __init__(self, log_line, column_families):
        """Split log_line into time, context and message, and attribute it to
        one of the given column_families (or NO_COL_FAMILY)."""
        token_list = log_line.strip().split()
        self.time = token_list[0]
        self.context = token_list[1]
        self.message = " ".join(token_list[2:])
        self.column_family = None
        # example log for 'default' column family:
        # "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
        # [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
        for col_fam in column_families:
            # NOTE(review): col_fam is interpolated into a regex verbatim; this
            # assumes column family names contain no regex metacharacters.
            search_for_str = r"\[" + col_fam + r"\]"
            if re.search(search_for_str, self.message):
                self.column_family = col_fam
                break
        if not self.column_family:
            self.column_family = NO_COL_FAMILY

    def get_human_readable_time(self):
        """Return the raw timestamp string, e.g. '2018/07/25-11:25:45.782710'."""
        return self.time

    def get_column_family(self):
        """Return the column family this entry was attributed to."""
        return self.column_family

    def get_context(self):
        """Return the thread-context token, e.g. '7f969de68700'."""
        return self.context

    def get_message(self):
        """Return the (possibly multi-line) message body."""
        return self.message

    def append_message(self, remaining_log):
        """Append a continuation line to this entry's message."""
        self.message = self.message + "\n" + remaining_log.strip()

    def get_timestamp(self):
        """Return the entry time as a Unix timestamp (int, seconds).

        example: '2018/07/25-11:25:45.782710' will be converted to the GMT
        Unix timestamp 1532517945 (note: this method assumes that self.time
        is in GMT).
        """
        hr_time = self.time + "GMT"
        timestamp = timegm(time.strptime(hr_time, "%Y/%m/%d-%H:%M:%S.%f%Z"))
        return timestamp

    def __repr__(self):
        return (
            "time: "
            + self.time
            + "; context: "
            + self.context
            + "; col_fam: "
            + self.column_family
            + "; message: "
            + self.message
        )
class DatabaseLogs(DataSource):
    """DataSource that scans Rocksdb LOG files (matching a path prefix) and
    triggers LogConditions whose regex matches a log entry's message."""

    def __init__(self, logs_path_prefix, column_families):
        super().__init__(DataSource.Type.LOG)
        self.logs_path_prefix = logs_path_prefix
        self.column_families = column_families

    def trigger_conditions_for_log(self, conditions, log):
        """Match one parsed Log entry against every condition's regex.

        For a LogCondition object, trigger is:
        Dict[column_family_name, List[Log]]. This explains why the condition
        was triggered and for which column families.
        """
        for cond in conditions:
            if re.search(cond.regex, log.get_message(), re.IGNORECASE):
                trigger = cond.get_trigger()
                if not trigger:
                    trigger = {}
                if log.get_column_family() not in trigger:
                    trigger[log.get_column_family()] = []
                trigger[log.get_column_family()].append(log)
                cond.set_trigger(trigger)

    def check_and_trigger_conditions(self, conditions):
        """Parse every non-'old' LOG file under the prefix and trigger the
        given conditions against each complete (multi-line) log entry."""
        for file_name in glob.glob(self.logs_path_prefix + "*"):
            # TODO(poojam23): find a way to distinguish between log files
            # - generated in the current experiment but are labeled 'old'
            # because they LOGs exceeded the file size limit AND
            # - generated in some previous experiment that are also labeled
            # 'old' and were not deleted for some reason
            if re.search("old", file_name, re.IGNORECASE):
                continue
            with open(file_name, "r") as db_logs:
                new_log = None
                for line in db_logs:
                    if Log.is_new_log(line):
                        if new_log:
                            self.trigger_conditions_for_log(conditions, new_log)
                        new_log = Log(line, self.column_families)
                    elif new_log:
                        # To account for logs split into multiple lines.
                        # The 'elif new_log' guard (rather than a bare 'else')
                        # skips orphan continuation lines at the top of a file,
                        # which would otherwise raise AttributeError on None.
                        new_log.append_message(line)
                # Check for the last log in the file.
                if new_log:
                    self.trigger_conditions_for_log(conditions, new_log)
class OptionsSpecParser(IniParser):
    """Parser helpers for the INI-like Rocksdb OPTIONS file format."""

    @staticmethod
    def is_new_option(line):
        """A line defines an option iff it contains an '=' sign."""
        return "=" in line

    @staticmethod
    def get_section_type(line):
        """
        Example section header: [TableOptions/BlockBasedTable "default"]
        Here the section type returned would be
        'TableOptions.BlockBasedTable'
        """
        header_path = line.strip()[1:-1].split()[0]
        return header_path.replace("/", ".")

    @staticmethod
    def get_section_name(line):
        """Return the quoted section name, or None when there is none.

        example: get_section_name('[CFOptions "default"]') -> 'default'
        """
        parts = line.strip()[1:-1].split('"')
        # parts = ['CFOptions ', 'default', ''] for the example above;
        # fewer than 3 pieces means the header carries no quoted name.
        return parts[1] if len(parts) >= 3 else None

    @staticmethod
    def get_section_str(section_type, section_name):
        """Render a section header string from its type and name.

        Example:
        Case 1: get_section_str('DBOptions', NO_COL_FAMILY) -> '[DBOptions]'
        Case 2: get_section_str('TableOptions.BlockBasedTable', 'default')
                -> '[TableOptions/BlockBasedTable "default"]'
        """
        path = section_type.strip().replace(".", "/")
        if section_name == NO_COL_FAMILY:
            return "[" + path + "]"
        return "[" + path + ' "' + section_name + '"]'

    @staticmethod
    def get_option_str(key, values):
        """Render one 'key=value' line; list values are joined with ':'.

        Examples:
        get_option_str('db_log_dir', None) -> 'db_log_dir='
        get_option_str('max_bytes_for_level_multiplier_additional',
                       [1,1,1,1,1,1,1])
            -> 'max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1'
        get_option_str('write_buffer_size', 1048576)
            -> 'write_buffer_size=1048576'
        """
        if not values:
            # option with no value (falsy) renders as 'key='
            return key + "="
        if isinstance(values, list):
            return key + "=" + ":".join(str(v) for v in values)
        return key + "=" + str(values)
class DatabaseOptions(DataSource):
    """In-memory model of a Rocksdb OPTIONS file plus 'misc' options.

    Options are addressed as '<section_type>.<option_name>' (e.g.
    'CFOptions.write_buffer_size') and their values are kept per column
    family; misc options (not yet supported by the OPTIONS file) have no
    section prefix and are always database-wide.
    """

    @staticmethod
    def is_misc_option(option_name):
        # these are miscellaneous options that are not yet supported by the
        # Rocksdb options file, hence they are not prefixed with any section
        # name
        return "." not in option_name

    @staticmethod
    def get_options_diff(opt_old, opt_new):
        """Return per-option, per-column-family (old, new) pairs that differ.

        type: Dict[option, Dict[col_fam, value]] X 2 ->
        Dict[option, Dict[col_fam, Tuple(old_value, new_value)]]
        note: diff should contain a tuple of values only if they are
        different from each other; a missing side is reported as None.
        """
        options_union = set(opt_old.keys()).union(set(opt_new.keys()))
        diff = {}
        for opt in options_union:
            diff[opt] = {}
            # if option in options_union, then it must be in one of the configs
            if opt not in opt_old:
                for col_fam in opt_new[opt]:
                    diff[opt][col_fam] = (None, opt_new[opt][col_fam])
            elif opt not in opt_new:
                for col_fam in opt_old[opt]:
                    diff[opt][col_fam] = (opt_old[opt][col_fam], None)
            else:
                # Walk both sides so column families present on only one
                # side are reported too.
                for col_fam in opt_old[opt]:
                    if col_fam in opt_new[opt]:
                        if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
                            diff[opt][col_fam] = (
                                opt_old[opt][col_fam],
                                opt_new[opt][col_fam],
                            )
                    else:
                        diff[opt][col_fam] = (opt_old[opt][col_fam], None)
                for col_fam in opt_new[opt]:
                    if col_fam in opt_old[opt]:
                        if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
                            diff[opt][col_fam] = (
                                opt_old[opt][col_fam],
                                opt_new[opt][col_fam],
                            )
                    else:
                        diff[opt][col_fam] = (None, opt_new[opt][col_fam])
            if not diff[opt]:
                diff.pop(opt)
        return diff

    def __init__(self, rocksdb_options, misc_options=None):
        """Load an OPTIONS file (path: rocksdb_options) and optional misc
        options given as a List[str] of '<name>=<value>' pairs."""
        super().__init__(DataSource.Type.DB_OPTIONS)
        # The options are stored in the following data structure:
        # Dict[section_type, Dict[section_name, Dict[option_name, value]]]
        self.options_dict = None
        self.column_families = None
        # Load the options from the given file to a dictionary.
        self.load_from_source(rocksdb_options)
        # Setup the miscellaneous options expected to be List[str], where each
        # element in the List has the format "<option_name>=<option_value>"
        # These options are the ones that are not yet supported by the Rocksdb
        # OPTIONS file, so they are provided separately
        self.setup_misc_options(misc_options)

    def setup_misc_options(self, misc_options):
        """Parse '<name>=<value>' strings into the misc_options dict."""
        self.misc_options = {}
        if misc_options:
            for option_pair_str in misc_options:
                option_name = option_pair_str.split("=")[0].strip()
                option_value = option_pair_str.split("=")[1].strip()
                self.misc_options[option_name] = option_value

    def load_from_source(self, options_path):
        """Parse the OPTIONS file at options_path into self.options_dict and
        record the column family names found in CFOptions sections."""
        self.options_dict = {}
        with open(options_path, "r") as db_options:
            for line in db_options:
                line = OptionsSpecParser.remove_trailing_comment(line)
                if not line:
                    continue
                if OptionsSpecParser.is_section_header(line):
                    curr_sec_type = OptionsSpecParser.get_section_type(line)
                    curr_sec_name = OptionsSpecParser.get_section_name(line)
                    if curr_sec_type not in self.options_dict:
                        self.options_dict[curr_sec_type] = {}
                    if not curr_sec_name:
                        curr_sec_name = NO_COL_FAMILY
                    self.options_dict[curr_sec_type][curr_sec_name] = {}
                    # example: if the line read from the Rocksdb OPTIONS file
                    # is [CFOptions "default"], then the section type is
                    # CFOptions and 'default' is the name of a column family
                    # that for this database, so it's added to the list of
                    # column families stored in this object
                    if curr_sec_type == "CFOptions":
                        if not self.column_families:
                            self.column_families = []
                        self.column_families.append(curr_sec_name)
                elif OptionsSpecParser.is_new_option(line):
                    key, value = OptionsSpecParser.get_key_value_pair(line)
                    self.options_dict[curr_sec_type][curr_sec_name][key] = value
                else:
                    error = "Not able to parse line in Options file."
                    OptionsSpecParser.exit_with_parse_error(line, error)

    def get_misc_options(self):
        # these are options that are not yet supported by the Rocksdb OPTIONS
        # file, hence they are provided and stored separately
        return self.misc_options

    def get_column_families(self):
        # List of column family names, or None if none were found.
        return self.column_families

    def get_all_options(self):
        # This method returns all the options that are stored in this object as
        # a: Dict[<sec_type>.<option_name>: Dict[col_fam, option_value]]
        all_options = []
        # Example: in the section header '[CFOptions "default"]' read from the
        # OPTIONS file, sec_type='CFOptions'
        for sec_type in self.options_dict:
            for col_fam in self.options_dict[sec_type]:
                for opt_name in self.options_dict[sec_type][col_fam]:
                    option = sec_type + "." + opt_name
                    all_options.append(option)
        all_options.extend(list(self.misc_options.keys()))
        return self.get_options(all_options)

    def get_options(self, reqd_options):
        """Fetch the values of the requested options per column family.

        type: List[str] -> Dict[str, Dict[str, Any]]
        List[option] -> Dict[option, Dict[col_fam, value]]
        Options that are unknown are silently omitted from the result.
        """
        reqd_options_dict = {}
        for option in reqd_options:
            if DatabaseOptions.is_misc_option(option):
                # the option is not prefixed by '<section_type>.' because it is
                # not yet supported by the Rocksdb OPTIONS file; so it has to
                # be fetched from the misc_options dictionary
                if option not in self.misc_options:
                    continue
                if option not in reqd_options_dict:
                    reqd_options_dict[option] = {}
                reqd_options_dict[option][NO_COL_FAMILY] = self.misc_options[option]
            else:
                # Example: option = 'TableOptions.BlockBasedTable.block_align'
                # then, sec_type = 'TableOptions.BlockBasedTable'
                sec_type = ".".join(option.split(".")[:-1])
                # opt_name = 'block_align'
                opt_name = option.split(".")[-1]
                if sec_type not in self.options_dict:
                    continue
                for col_fam in self.options_dict[sec_type]:
                    if opt_name in self.options_dict[sec_type][col_fam]:
                        if option not in reqd_options_dict:
                            reqd_options_dict[option] = {}
                        reqd_options_dict[option][col_fam] = self.options_dict[
                            sec_type
                        ][col_fam][opt_name]
        return reqd_options_dict

    def update_options(self, options):
        """Insert or update option values in this object.

        An example 'options' object looks like:
        {'DBOptions.max_background_jobs': {NO_COL_FAMILY: 2},
        'CFOptions.write_buffer_size': {'default': 1048576, 'cf_A': 128000},
        'bloom_bits': {NO_COL_FAMILY: 4}}
        """
        for option in options:
            if DatabaseOptions.is_misc_option(option):
                # this is a misc_option i.e. an option that is not yet
                # supported by the Rocksdb OPTIONS file, so it is not prefixed
                # by '<section_type>.' and must be stored in the separate
                # misc_options dictionary
                if NO_COL_FAMILY not in options[option]:
                    print(
                        "WARNING(DatabaseOptions.update_options): not "
                        + "updating option "
                        + option
                        + " because it is in "
                        + "misc_option format but its scope is not "
                        + NO_COL_FAMILY
                        + ". Check format of option."
                    )
                    continue
                self.misc_options[option] = options[option][NO_COL_FAMILY]
            else:
                sec_name = ".".join(option.split(".")[:-1])
                opt_name = option.split(".")[-1]
                if sec_name not in self.options_dict:
                    self.options_dict[sec_name] = {}
                for col_fam in options[option]:
                    # if the option is not already present in the dictionary,
                    # it will be inserted, else it will be updated to the new
                    # value
                    if col_fam not in self.options_dict[sec_name]:
                        self.options_dict[sec_name][col_fam] = {}
                    # deepcopy so later mutation of the caller's value does
                    # not silently change this object's state
                    self.options_dict[sec_name][col_fam][opt_name] = copy.deepcopy(
                        options[option][col_fam]
                    )

    def generate_options_config(self, nonce):
        """Write self.options_dict out as an OPTIONS file and return its path.

        this method generates a Rocksdb OPTIONS file in the INI format from
        the options stored in self.options_dict; 'nonce' disambiguates the
        temp file name.
        """
        this_path = os.path.abspath(os.path.dirname(__file__))
        file_name = "../temp/OPTIONS_" + str(nonce) + ".tmp"
        file_path = os.path.join(this_path, file_name)
        with open(file_path, "w") as fp:
            for section in self.options_dict:
                for col_fam in self.options_dict[section]:
                    fp.write(OptionsSpecParser.get_section_str(section, col_fam) + "\n")
                    for option in self.options_dict[section][col_fam]:
                        values = self.options_dict[section][col_fam][option]
                        fp.write(
                            OptionsSpecParser.get_option_str(option, values) + "\n"
                        )
                fp.write("\n")
        return file_path

    def check_and_trigger_conditions(self, conditions):
        """Evaluate each OptionCondition against the stored option values.

        NOTE(review): cond.eval_expr is evaluated with eval(); expressions
        come from the rules spec, which must therefore be trusted input.
        """
        for cond in conditions:
            reqd_options_dict = self.get_options(cond.options)
            # This contains the indices of options that are specific to some
            # column family and are not database-wide options.
            incomplete_option_ix = []
            options = []
            missing_reqd_option = False
            for ix, option in enumerate(cond.options):
                if option not in reqd_options_dict:
                    print(
                        "WARNING(DatabaseOptions.check_and_trigger): "
                        + "skipping condition "
                        + cond.name
                        + " because it "
                        "requires option "
                        + option
                        + " but this option is"
                        + " not available"
                    )
                    missing_reqd_option = True
                    break  # required option is absent
                if NO_COL_FAMILY in reqd_options_dict[option]:
                    options.append(reqd_options_dict[option][NO_COL_FAMILY])
                else:
                    # placeholder; filled in per column family below
                    options.append(None)
                    incomplete_option_ix.append(ix)

            if missing_reqd_option:
                continue

            # if all the options are database-wide options
            if not incomplete_option_ix:
                try:
                    if eval(cond.eval_expr):
                        cond.set_trigger({NO_COL_FAMILY: options})
                except Exception as e:
                    print("WARNING(DatabaseOptions) check_and_trigger:" + str(e))
                continue

            # for all the options that are not database-wide, we look for their
            # values specific to column families
            col_fam_options_dict = {}
            for col_fam in self.column_families:
                present = True
                for ix in incomplete_option_ix:
                    option = cond.options[ix]
                    if col_fam not in reqd_options_dict[option]:
                        present = False
                        break
                    options[ix] = reqd_options_dict[option][col_fam]
                if present:
                    try:
                        if eval(cond.eval_expr):
                            col_fam_options_dict[col_fam] = copy.deepcopy(options)
                    except Exception as e:
                        print("WARNING(DatabaseOptions) check_and_trigger: " + str(e))
            # Trigger for an OptionCondition object is of the form:
            # Dict[col_fam_name: List[option_value]]
            # where col_fam_name is the name of a column family for which
            # 'eval_expr' evaluated to True and List[option_value] is the list
            # of values of the options specified in the condition's 'options'
            # field
            if col_fam_options_dict:
                cond.set_trigger(col_fam_options_dict)
class LogStatsParser(TimeSeriesData):
    """TimeSeriesData source that extracts 'STATISTICS:' dumps from Rocksdb
    LOG files into per-stat, per-timestamp series (all under NO_ENTITY)."""

    # marker that identifies a statistics dump inside a log message
    STATS = "STATISTICS:"

    @staticmethod
    def parse_log_line_for_stats(log_line):
        """Parse one stats line into {stat_name: float_value}.

        Example stat line (from LOG file):
        "rocksdb.db.get.micros P50 : 8.4 P95 : 21.8 P99 : 33.9 P100 : 92.0\n"
        """
        token_list = log_line.strip().split()
        # token_list = ['rocksdb.db.get.micros', 'P50', ':', '8.4', 'P95', ':',
        # '21.8', 'P99', ':', '33.9', 'P100', ':', '92.0']
        stat_prefix = token_list[0] + "."  # 'rocksdb.db.get.micros.'
        stat_values = [token for token in token_list[1:] if token != ":"]
        # stat_values = ['P50', '8.4', 'P95', '21.8', 'P99', '33.9', 'P100',
        # '92.0'] -- alternating name-suffix / value pairs
        stat_dict = {}
        for ix, metric in enumerate(stat_values):
            if ix % 2 == 0:
                stat_name = stat_prefix + metric
                stat_name = stat_name.lower()  # Note: case insensitive names
            else:
                stat_dict[stat_name] = float(metric)
        # stat_dict = {'rocksdb.db.get.micros.p50': 8.4,
        # 'rocksdb.db.get.micros.p95': 21.8, 'rocksdb.db.get.micros.p99': 33.9,
        # 'rocksdb.db.get.micros.p100': 92.0}
        return stat_dict

    def __init__(self, logs_path_prefix, stats_freq_sec):
        """logs_path_prefix: glob prefix of the LOG files to scan;
        stats_freq_sec: interval at which Rocksdb dumps statistics."""
        super().__init__()
        self.logs_file_prefix = logs_path_prefix
        self.stats_freq_sec = stats_freq_sec
        self.duration_sec = 60

    def get_keys_from_conditions(self, conditions):
        """Collect the stat names required by the conditions.

        Note: case insensitive stat names.
        """
        reqd_stats = []
        for cond in conditions:
            for key in cond.keys:
                key = key.lower()
                # some keys are prepended with '[]' for OdsStatsFetcher to
                # replace this with the appropriate key_prefix, remove these
                # characters here since the LogStatsParser does not need
                # a prefix
                if key.startswith("[]"):
                    reqd_stats.append(key[2:])
                else:
                    reqd_stats.append(key)
        return reqd_stats

    def add_to_timeseries(self, log, reqd_stats):
        # this method takes in the Log object that contains the Rocksdb stats
        # and a list of required stats, then it parses the stats line by line
        # to fetch required stats and add them to the keys_ts object
        # Example: reqd_stats = ['rocksdb.block.cache.hit.count',
        # 'rocksdb.db.get.micros.p99']
        # Let log.get_message() returns following string:
        # "[WARN] [db/db_impl.cc:485] STATISTICS:\n
        # rocksdb.block.cache.miss COUNT : 1459\n
        # rocksdb.block.cache.hit COUNT : 37\n
        # ...
        # rocksdb.db.get.micros P50 : 15.6 P95 : 39.7 P99 : 62.6 P100 : 148.0\n
        # ..."
        new_lines = log.get_message().split("\n")
        # let log_ts = 1532518219
        log_ts = log.get_timestamp()
        # example updates to keys_ts:
        # keys_ts[NO_ENTITY]['rocksdb.db.get.micros.p99'][1532518219] = 62.6
        # keys_ts[NO_ENTITY]['rocksdb.block.cache.hit.count'][1532518219] = 37
        for line in new_lines[1:]:  # new_lines[0] does not contain any stats
            stats_on_line = self.parse_log_line_for_stats(line)
            for stat in stats_on_line:
                if stat in reqd_stats:
                    if stat not in self.keys_ts[NO_ENTITY]:
                        self.keys_ts[NO_ENTITY][stat] = {}
                    self.keys_ts[NO_ENTITY][stat][log_ts] = stats_on_line[stat]

    def fetch_timeseries(self, reqd_stats):
        # this method parses the Rocksdb LOG file and generates timeseries for
        # each of the statistic in the list reqd_stats
        self.keys_ts = {NO_ENTITY: {}}
        for file_name in glob.glob(self.logs_file_prefix + "*"):
            # TODO(poojam23): find a way to distinguish between 'old' log files
            # from current and previous experiments, present in the same
            # directory
            if re.search("old", file_name, re.IGNORECASE):
                continue
            with open(file_name, "r") as db_logs:
                new_log = None
                for line in db_logs:
                    if Log.is_new_log(line):
                        # only entries carrying a STATISTICS dump are parsed
                        if new_log and re.search(self.STATS, new_log.get_message()):
                            self.add_to_timeseries(new_log, reqd_stats)
                        new_log = Log(line, column_families=[])
                    else:
                        # To account for logs split into multiple lines
                        # NOTE(review): assumes the file starts with a dated
                        # log line; an orphan continuation line would make
                        # new_log None here -- confirm against LOG format.
                        new_log.append_message(line)
                # Check for the last log in the file.
                if new_log and re.search(self.STATS, new_log.get_message()):
                    self.add_to_timeseries(new_log, reqd_stats)
class DatabasePerfContext(TimeSeriesData):
    # TODO(poojam23): check if any benchrunner provides PerfContext sampled at
    # regular intervals
    def __init__(self, perf_context_ts, stats_freq_sec, cumulative):
        """
        perf_context_ts is expected to be in the following format:
        Dict[metric, Dict[timestamp, value]], where for
        each (metric, timestamp) pair, the value is database-wide (i.e.
        summed over all the threads involved)
        if stats_freq_sec == 0, per-metric only one value is reported
        """
        super().__init__()
        self.stats_freq_sec = stats_freq_sec
        # PerfContext is database-wide, so everything lives under NO_ENTITY.
        self.keys_ts = {NO_ENTITY: perf_context_ts}
        if cumulative:
            self.unaccumulate_metrics()

    def unaccumulate_metrics(self):
        # if the perf context metrics provided are cumulative in nature, this
        # method can be used to convert them to a disjoint format: each
        # timestamp's value becomes the delta against the previous timestamp.
        epoch_ts = copy.deepcopy(self.keys_ts)
        for stat in self.keys_ts[NO_ENTITY]:
            # reverse=True so each timestamp's delta is computed against the
            # still-cumulative (original) value of the one before it
            timeseries = sorted(
                list(self.keys_ts[NO_ENTITY][stat].keys()), reverse=True
            )
            if len(timeseries) < 2:
                continue
            for ix, ts in enumerate(timeseries[:-1]):
                epoch_ts[NO_ENTITY][stat][ts] = (
                    epoch_ts[NO_ENTITY][stat][ts]
                    - epoch_ts[NO_ENTITY][stat][timeseries[ix + 1]]
                )
                # a negative delta means the input was not truly cumulative
                if epoch_ts[NO_ENTITY][stat][ts] < 0:
                    raise ValueError("DBPerfContext: really cumulative?")
            # drop the smallest timestamp in the timeseries for this metric
            # (it has no predecessor, so no delta can be computed for it)
            epoch_ts[NO_ENTITY][stat].pop(timeseries[-1])
        self.keys_ts = epoch_ts

    def get_keys_from_conditions(self, conditions):
        """Collect (lowercased) stat names required by the conditions."""
        reqd_stats = []
        for cond in conditions:
            reqd_stats.extend([key.lower() for key in cond.keys])
        return reqd_stats

    def fetch_timeseries(self, statistics):
        # this method is redundant for DatabasePerfContext because the __init__
        # does the job of populating 'keys_ts'
        pass
class OdsStatsFetcher(TimeSeriesData):
    """TimeSeriesData source that shells out to an ODS/rapido CLI client to
    fetch entity/key timeseries, parsing the client's file output."""

    # class constants: files the client's stdout/stderr are captured into
    OUTPUT_FILE = "temp/stats_out.tmp"
    ERROR_FILE = "temp/stats_err.tmp"
    # command template for the rapido client
    RAPIDO_COMMAND = "%s --entity=%s --key=%s --tstart=%s --tend=%s --showtime"

    # static methods
    @staticmethod
    def _get_string_in_quotes(value):
        """Wrap value in double quotes for use as a shell argument."""
        return '"' + str(value) + '"'

    @staticmethod
    def _get_time_value_pair(pair_string):
        """Parse one '[ts, value]' token into [int_ts, float_value].

        example pair_string: '[1532544591, 97.3653601828]'
        """
        pair_string = pair_string.replace("[", "")
        pair_string = pair_string.replace("]", "")
        pair = pair_string.split(",")
        first = int(pair[0].strip())
        second = float(pair[1].strip())
        return [first, second]

    @staticmethod
    def _get_ods_cli_stime(start_time):
        """Convert an absolute epoch start_time into the ODS CLI's relative
        '<seconds>_s' (seconds-ago) format."""
        diff = int(time.time() - int(start_time))
        stime = str(diff) + "_s"
        return stime

    def __init__(self, client, entities, start_time, end_time, key_prefix=None):
        """client: path/name of the ODS or rapido CLI binary;
        entities/start_time/end_time: query scope;
        key_prefix: optional prefix substituted for '[]' in condition keys."""
        super().__init__()
        self.client = client
        self.entities = entities
        self.start_time = start_time
        self.end_time = end_time
        self.key_prefix = key_prefix
        self.stats_freq_sec = 60
        self.duration_sec = 60

    def execute_script(self, command):
        """Run the client command, capturing stdout/stderr to temp files.

        NOTE(review): command is executed with shell=True; it is built from
        constructor arguments, which must therefore be trusted input.
        """
        print("executing...")
        print(command)
        # context managers ensure both capture files are closed even if
        # subprocess.call raises (the original leaked them on error)
        with open(self.OUTPUT_FILE, "w+") as out_file, open(
            self.ERROR_FILE, "w+"
        ) as err_file:
            subprocess.call(command, shell=True, stdout=out_file, stderr=err_file)

    def parse_rapido_output(self):
        # Output looks like the following:
        # <entity_name>\t<key_name>\t[[ts, value], [ts, value], ...]
        # ts = timestamp; value = value of key_name in entity_name at time ts
        self.keys_ts = {}
        with open(self.OUTPUT_FILE, "r") as fp:
            for line in fp:
                token_list = line.strip().split("\t")
                entity = token_list[0]
                key = token_list[1]
                if entity not in self.keys_ts:
                    self.keys_ts[entity] = {}
                if key not in self.keys_ts[entity]:
                    self.keys_ts[entity][key] = {}
                list_of_lists = [
                    self._get_time_value_pair(pair_string)
                    for pair_string in token_list[2].split("],")
                ]
                value = {pair[0]: pair[1] for pair in list_of_lists}
                self.keys_ts[entity][key] = value

    def parse_ods_output(self):
        # Output looks like the following:
        # <entity_name>\t<key_name>\t<timestamp>\t<value>
        # there is one line per (entity_name, key_name, timestamp)
        # NOTE(review): timestamps/values are stored as strings here, unlike
        # parse_rapido_output which converts them -- confirm downstream usage.
        self.keys_ts = {}
        with open(self.OUTPUT_FILE, "r") as fp:
            for line in fp:
                token_list = line.split()
                entity = token_list[0]
                if entity not in self.keys_ts:
                    self.keys_ts[entity] = {}
                key = token_list[1]
                if key not in self.keys_ts[entity]:
                    self.keys_ts[entity][key] = {}
                self.keys_ts[entity][key][token_list[2]] = token_list[3]

    def fetch_timeseries(self, statistics):
        # this method fetches the timeseries of required stats from the ODS
        # service and populates the 'keys_ts' object appropriately
        print("OdsStatsFetcher: fetching " + str(statistics))
        if re.search("rapido", self.client, re.IGNORECASE):
            command = self.RAPIDO_COMMAND % (
                self.client,
                self._get_string_in_quotes(self.entities),
                self._get_string_in_quotes(",".join(statistics)),
                self._get_string_in_quotes(self.start_time),
                self._get_string_in_quotes(self.end_time),
            )
            # Run the tool and fetch the time-series data
            self.execute_script(command)
            # Parse output and populate the 'keys_ts' map
            self.parse_rapido_output()
        elif re.search("ods", self.client, re.IGNORECASE):
            command = (
                self.client
                + " "
                + "--stime="
                + self._get_ods_cli_stime(self.start_time)
                + " "
                + self._get_string_in_quotes(self.entities)
                + " "
                + self._get_string_in_quotes(",".join(statistics))
            )
            # Run the tool and fetch the time-series data
            self.execute_script(command)
            # Parse output and populate the 'keys_ts' map
            self.parse_ods_output()

    def get_keys_from_conditions(self, conditions):
        """Collect the stat keys required by the conditions, applying the
        '[]' -> key_prefix substitution and the '.60' suffix for rocksdb
        counters."""
        reqd_stats = []
        for cond in conditions:
            for key in cond.keys:
                use_prefix = False
                if key.startswith("[]"):
                    use_prefix = True
                    key = key[2:]
                # TODO(poojam23): this is very hacky and needs to be improved
                if key.startswith("rocksdb"):
                    key += ".60"
                if use_prefix:
                    if not self.key_prefix:
                        print("Warning: OdsStatsFetcher might need key prefix")
                        print("for the key: " + key)
                    else:
                        key = self.key_prefix + "." + key
                reqd_stats.append(key)
        return reqd_stats

    def fetch_rate_url(
        self,
        entities: List[str],
        keys: List[str],
        window_len: str,
        percent: str,
        display: bool,
    ) -> str:
        """Ask the client for a chart URL of the rate() transform of the given
        keys over the given entities; returns the URL read back from the
        client's output file."""
        transform_desc = (
            "rate(" + str(window_len) + ",duration=" + str(self.duration_sec)
        )
        if percent:
            transform_desc = transform_desc + ",%)"
        else:
            transform_desc = transform_desc + ")"
        if re.search("rapido", self.client, re.IGNORECASE):
            command = self.RAPIDO_COMMAND + " --transform=%s --url=%s"
            command = command % (
                self.client,
                self._get_string_in_quotes(",".join(entities)),
                self._get_string_in_quotes(",".join(keys)),
                self._get_string_in_quotes(self.start_time),
                self._get_string_in_quotes(self.end_time),
                self._get_string_in_quotes(transform_desc),
                self._get_string_in_quotes(display),
            )
        elif re.search("ods", self.client, re.IGNORECASE):
            command = (
                self.client
                + " "
                + "--stime="
                + self._get_ods_cli_stime(self.start_time)
                + " "
                + "--fburlonly "
                + self._get_string_in_quotes(entities)
                + " "
                + self._get_string_in_quotes(",".join(keys))
                + " "
                + self._get_string_in_quotes(transform_desc)
            )
        self.execute_script(command)
        url = ""
        with open(self.OUTPUT_FILE, "r") as fp:
            url = fp.readline()
        return url
class TimeSeriesData(DataSource):
    """Abstract DataSource over per-entity, per-key timeseries.

    Subclasses populate self.keys_ts as
    Dict[entity, Dict[key, Dict[timestamp, value]]] and define how keys are
    derived from conditions and how the series are fetched.
    """

    class Behavior(Enum):
        # condition types a timeseries condition can check for
        bursty = 1
        evaluate_expression = 2

    class AggregationOperator(Enum):
        # ways to collapse a timeseries into a single value
        avg = 1
        max = 2
        min = 3
        latest = 4
        oldest = 5

    def __init__(self):
        super().__init__(DataSource.Type.TIME_SERIES)
        self.keys_ts = None  # Dict[entity, Dict[key, Dict[timestamp, value]]]
        self.stats_freq_sec = None  # sampling interval; 0 => not a timeseries

    @abstractmethod
    def get_keys_from_conditions(self, conditions):
        # This method takes in a list of time-series conditions; for each
        # condition it manipulates the 'keys' in the way that is supported by
        # the subclass implementing this method
        pass

    @abstractmethod
    def fetch_timeseries(self, required_statistics):
        # this method takes in a list of statistics and fetches the timeseries
        # for each of them and populates the 'keys_ts' dictionary
        pass

    def fetch_burst_epochs(
        self,
        entities: "List[str]",  # lazy string annotation: typing.List not imported here
        statistic: str,
        window_sec: float,
        threshold: float,
        percent: bool,
    ) -> Dict[str, Dict[int, float]]:
        # this method calculates the (percent) rate change in the 'statistic'
        # for each entity (over 'window_sec' seconds) and returns the epochs
        # where this rate change is greater than or equal to the 'threshold'
        # value
        # (annotations above corrected: entities is a list of entity names,
        # statistic is a key name, threshold is numeric)
        if self.stats_freq_sec == 0:
            # not time series data, cannot check for bursty behavior
            return
        if window_sec < self.stats_freq_sec:
            window_sec = self.stats_freq_sec
        # 'window_samples' is the number of windows to go back to
        # compare the current window with, while calculating rate change.
        window_samples = math.ceil(window_sec / self.stats_freq_sec)
        burst_epochs = {}
        # if percent = False:
        # curr_val = value at window for which rate change is being calculated
        # prev_val = value at window that is window_samples behind curr_window
        # Then rate_without_percent =
        # ((curr_val-prev_val)*duration_sec)/(curr_timestamp-prev_timestamp)
        # if percent = True:
        # rate_with_percent = (rate_without_percent * 100) / prev_val
        # These calculations are in line with the rate() transform supported
        # by ODS
        # NOTE(review): self.duration_sec is only set by subclasses -- this
        # method assumes the concrete subclass defines it; confirm.
        for entity in entities:
            if statistic not in self.keys_ts[entity]:
                continue
            timestamps = sorted(list(self.keys_ts[entity][statistic].keys()))
            for ix in range(window_samples, len(timestamps), 1):
                first_ts = timestamps[ix - window_samples]
                last_ts = timestamps[ix]
                first_val = self.keys_ts[entity][statistic][first_ts]
                last_val = self.keys_ts[entity][statistic][last_ts]
                diff = last_val - first_val
                if percent:
                    diff = diff * 100 / first_val
                rate = (diff * self.duration_sec) / (last_ts - first_ts)
                # if the rate change is greater than the provided threshold,
                # then the condition is triggered for entity at time 'last_ts'
                if rate >= threshold:
                    if entity not in burst_epochs:
                        burst_epochs[entity] = {}
                    burst_epochs[entity][last_ts] = rate
        return burst_epochs

    def fetch_aggregated_values(self, entity, statistics, aggregation_op):
        # this method performs the aggregation specified by 'aggregation_op'
        # on the timeseries of 'statistics' for 'entity' and returns:
        # Dict[statistic, aggregated_value]
        result = {}
        for stat in statistics:
            if stat not in self.keys_ts[entity]:
                continue
            agg_val = None
            if aggregation_op is self.AggregationOperator.latest:
                latest_timestamp = max(list(self.keys_ts[entity][stat].keys()))
                agg_val = self.keys_ts[entity][stat][latest_timestamp]
            elif aggregation_op is self.AggregationOperator.oldest:
                oldest_timestamp = min(list(self.keys_ts[entity][stat].keys()))
                agg_val = self.keys_ts[entity][stat][oldest_timestamp]
            elif aggregation_op is self.AggregationOperator.max:
                agg_val = max(list(self.keys_ts[entity][stat].values()))
            elif aggregation_op is self.AggregationOperator.min:
                agg_val = min(list(self.keys_ts[entity][stat].values()))
            elif aggregation_op is self.AggregationOperator.avg:
                values = list(self.keys_ts[entity][stat].values())
                agg_val = sum(values) / len(values)
            result[stat] = agg_val
        return result

    def check_and_trigger_conditions(self, conditions):
        """Fetch the required series and trigger each condition according to
        its Behavior (bursty or evaluate_expression)."""
        # get the list of statistics that need to be fetched
        reqd_keys = self.get_keys_from_conditions(conditions)
        # fetch the required statistics and populate the map 'keys_ts'
        self.fetch_timeseries(reqd_keys)
        # Trigger the appropriate conditions
        for cond in conditions:
            complete_keys = self.get_keys_from_conditions([cond])
            # Get the entities that have all statistics required by 'cond':
            # an entity is checked for a given condition only if we possess all
            # of the condition's 'keys' for that entity
            entities_with_stats = []
            for entity in self.keys_ts:
                stat_missing = False
                for stat in complete_keys:
                    if stat not in self.keys_ts[entity]:
                        stat_missing = True
                        break
                if not stat_missing:
                    entities_with_stats.append(entity)
            if not entities_with_stats:
                continue
            if cond.behavior is self.Behavior.bursty:
                # for a condition that checks for bursty behavior, only one key
                # should be present in the condition's 'keys' field
                result = self.fetch_burst_epochs(
                    entities_with_stats,
                    complete_keys[0],  # there should be only one key
                    cond.window_sec,
                    cond.rate_threshold,
                    True,
                )
                # Trigger in this case is:
                # Dict[entity_name, Dict[timestamp, rate_change]]
                # where the inner dictionary contains rate_change values when
                # the rate_change >= threshold provided, with the
                # corresponding timestamps
                if result:
                    cond.set_trigger(result)
            elif cond.behavior is self.Behavior.evaluate_expression:
                self.handle_evaluate_expression(
                    cond, complete_keys, entities_with_stats
                )

    def handle_evaluate_expression(self, condition, statistics, entities):
        """Evaluate condition.expression per entity (aggregated or per-epoch)
        and set the condition's trigger with the matching values.

        NOTE(review): condition.expression is evaluated with eval(); it comes
        from the rules spec, which must therefore be trusted input.
        """
        trigger = {}
        # check 'condition' for each of these entities
        for entity in entities:
            if hasattr(condition, "aggregation_op"):
                # in this case, the aggregation operation is performed on each
                # of the condition's 'keys' and then with aggregated values
                # condition's 'expression' is evaluated; if it evaluates to
                # True, then list of the keys values is added to the
                # condition's trigger: Dict[entity_name, List[stats]]
                result = self.fetch_aggregated_values(
                    entity, statistics, condition.aggregation_op
                )
                keys = [result[key] for key in statistics]
                try:
                    if eval(condition.expression):
                        trigger[entity] = keys
                except Exception as e:
                    print("WARNING(TimeSeriesData) check_and_trigger: " + str(e))
            else:
                # assumption: all stats have same series of timestamps
                # this is similar to the above but 'expression' is evaluated at
                # each timestamp, since there is no aggregation, and all the
                # epochs are added to the trigger when the condition's
                # 'expression' evaluated to true; so trigger is:
                # Dict[entity, Dict[timestamp, List[stats]]]
                for epoch in self.keys_ts[entity][statistics[0]].keys():
                    keys = [self.keys_ts[entity][key][epoch] for key in statistics]
                    try:
                        if eval(condition.expression):
                            if entity not in trigger:
                                trigger[entity] = {}
                            trigger[entity][epoch] = keys
                    except Exception as e:
                        print("WARNING(TimeSeriesData) check_and_trigger: " + str(e))
        if trigger:
            condition.set_trigger(trigger)
class IniParser:
    """Static helpers for parsing the advisor's rules INI format.

    The format consists of section headers like [Rule "name"] and
    key=value lines; '#' starts a trailing comment and ':' separates
    multi-valued entries.
    """

    class Element(Enum):
        # kinds of line recognized by get_element()
        rule = 1
        cond = 2
        sugg = 3
        key_val = 4
        comment = 5

    @staticmethod
    def remove_trailing_comment(line):
        """Strip surrounding whitespace and drop any trailing '#' comment."""
        line = line.strip()
        comment_start = line.find("#")
        if comment_start > -1:
            return line[:comment_start]
        return line

    @staticmethod
    def is_section_header(line):
        # A section header looks like: [Rule "my-new-rule"]. Essentially,
        # a line that is in square-brackets.
        line = line.strip()
        return line.startswith("[") and line.endswith("]")

    @staticmethod
    def get_section_name(line):
        """Return the quoted name from a header: [Rule "my-new-rule"] ->
        "my-new-rule". Raises ValueError on a malformed header."""
        token_list = line.strip()[1:-1].split('"')
        if len(token_list) < 3:
            error = 'needed section header: [<section_type> "<section_name>"]'
            raise ValueError("Parsing error: " + error + "\n" + line)
        return token_list[1]

    @staticmethod
    def get_element(line):
        """Classify a line as one of Element; raises ValueError when the
        line is neither a known section header nor a key=value pair."""
        line = IniParser.remove_trailing_comment(line)
        if not line:
            # blank (or comment-only) lines are treated as comments
            return IniParser.Element.comment
        if IniParser.is_section_header(line):
            section_type = line.strip()[1:-1]
            if section_type.startswith("Suggestion"):
                return IniParser.Element.sugg
            if section_type.startswith("Rule"):
                return IniParser.Element.rule
            if section_type.startswith("Condition"):
                return IniParser.Element.cond
        if "=" in line:
            return IniParser.Element.key_val
        error = "not a recognizable RulesSpec element"
        raise ValueError("Parsing error: " + error + "\n" + line)

    @staticmethod
    def get_key_value_pair(line):
        """Parse 'key=value' into (key, value).

        Returns (key, None) for a valueless option, (key, str) for a
        single value and (key, list) for a ':'-separated value. Only the
        first '=' splits; later '='s stay in the value (regexes need this).
        """
        line = line.strip()
        key = line.split("=")[0].strip()
        # Fix: strip the value as well, so "key = val" yields "val" (not
        # " val") and "key= " is treated as a valueless option.
        value = "=".join(line.split("=")[1:]).strip()
        if value == "":  # if the option has no value
            return (key, None)
        values = IniParser.get_list_from_value(value)
        if len(values) == 1:
            return (key, value)
        return (key, values)

    @staticmethod
    def get_list_from_value(value):
        # ':' is the multi-value separator used throughout the rules spec
        return value.strip().split(":")
+ if key == "conditions": + if isinstance(value, str): + self.conditions = [value] + else: + self.conditions = value + elif key == "suggestions": + if isinstance(value, str): + self.suggestions = [value] + else: + self.suggestions = value + elif key == "overlap_time_period": + self.overlap_time_seconds = value + + def get_suggestions(self): + return self.suggestions + + def perform_checks(self): + if not self.conditions or len(self.conditions) < 1: + raise ValueError(self.name + ": rule must have at least one condition") + if not self.suggestions or len(self.suggestions) < 1: + raise ValueError(self.name + ": rule must have at least one suggestion") + if self.overlap_time_seconds: + if len(self.conditions) != 2: + raise ValueError( + self.name + + ": rule must be associated with 2 conditions\ + in order to check for a time dependency between them" + ) + time_format = "^\d+[s|m|h|d]$" # noqa + if not re.match(time_format, self.overlap_time_seconds, re.IGNORECASE): + raise ValueError( + self.name + ": overlap_time_seconds format: \d+[s|m|h|d]" + ) + else: # convert to seconds + in_seconds = int(self.overlap_time_seconds[:-1]) + if self.overlap_time_seconds[-1] == "m": + in_seconds *= 60 + elif self.overlap_time_seconds[-1] == "h": + in_seconds *= 60 * 60 + elif self.overlap_time_seconds[-1] == "d": + in_seconds *= 24 * 60 * 60 + self.overlap_time_seconds = in_seconds + + def get_overlap_timestamps(self, key1_trigger_epochs, key2_trigger_epochs): + # this method takes in 2 timeseries i.e. 
timestamps at which the + # rule's 2 TIME_SERIES conditions were triggered and it finds + # (if present) the first pair of timestamps at which the 2 conditions + # were triggered within 'overlap_time_seconds' of each other + key1_lower_bounds = [ + epoch - self.overlap_time_seconds for epoch in key1_trigger_epochs + ] + key1_lower_bounds.sort() + key2_trigger_epochs.sort() + trigger_ix = 0 + overlap_pair = None + for key1_lb in key1_lower_bounds: + while key2_trigger_epochs[trigger_ix] < key1_lb and trigger_ix < len( + key2_trigger_epochs + ): + trigger_ix += 1 + if trigger_ix >= len(key2_trigger_epochs): + break + if key2_trigger_epochs[trigger_ix] <= key1_lb + ( + 2 * self.overlap_time_seconds + ): + overlap_pair = ( + key2_trigger_epochs[trigger_ix], + key1_lb + self.overlap_time_seconds, + ) + break + return overlap_pair + + def get_trigger_entities(self): + return self.trigger_entities + + def get_trigger_column_families(self): + return self.trigger_column_families + + def is_triggered(self, conditions_dict, column_families): + if self.overlap_time_seconds: + condition1 = conditions_dict[self.conditions[0]] + condition2 = conditions_dict[self.conditions[1]] + if not ( + condition1.get_data_source() is DataSource.Type.TIME_SERIES + and condition2.get_data_source() is DataSource.Type.TIME_SERIES + ): + raise ValueError(self.name + ": need 2 timeseries conditions") + + map1 = condition1.get_trigger() + map2 = condition2.get_trigger() + if not (map1 and map2): + return False + + self.trigger_entities = {} + is_triggered = False + entity_intersection = set(map1.keys()).intersection(set(map2.keys())) + for entity in entity_intersection: + overlap_timestamps_pair = self.get_overlap_timestamps( + list(map1[entity].keys()), list(map2[entity].keys()) + ) + if overlap_timestamps_pair: + self.trigger_entities[entity] = overlap_timestamps_pair + is_triggered = True + if is_triggered: + self.trigger_column_families = set(column_families) + return is_triggered + else: + 
all_conditions_triggered = True + self.trigger_column_families = set(column_families) + for cond_name in self.conditions: + cond = conditions_dict[cond_name] + if not cond.get_trigger(): + all_conditions_triggered = False + break + if ( + cond.get_data_source() is DataSource.Type.LOG + or cond.get_data_source() is DataSource.Type.DB_OPTIONS + ): + cond_col_fam = set(cond.get_trigger().keys()) + if NO_COL_FAMILY in cond_col_fam: + cond_col_fam = set(column_families) + self.trigger_column_families = ( + self.trigger_column_families.intersection(cond_col_fam) + ) + elif cond.get_data_source() is DataSource.Type.TIME_SERIES: + cond_entities = set(cond.get_trigger().keys()) + if self.trigger_entities is None: + self.trigger_entities = cond_entities + else: + self.trigger_entities = self.trigger_entities.intersection( + cond_entities + ) + if not (self.trigger_entities or self.trigger_column_families): + all_conditions_triggered = False + break + if not all_conditions_triggered: # clean up if rule not triggered + self.trigger_column_families = None + self.trigger_entities = None + return all_conditions_triggered + + def __repr__(self): + # Append conditions + rule_string = "Rule: " + self.name + " has conditions:: " + is_first = True + for cond in self.conditions: + if is_first: + rule_string += cond + is_first = False + else: + rule_string += " AND " + cond + # Append suggestions + rule_string += "\nsuggestions:: " + is_first = True + for sugg in self.suggestions: + if is_first: + rule_string += sugg + is_first = False + else: + rule_string += ", " + sugg + if self.trigger_entities: + rule_string += ", entities:: " + str(self.trigger_entities) + if self.trigger_column_families: + rule_string += ", col_fam:: " + str(self.trigger_column_families) + # Return constructed string + return rule_string + + +class Suggestion(Section): + class Action(Enum): + set = 1 + increase = 2 + decrease = 3 + + def __init__(self, name): + super().__init__(name) + self.option = None + 
    def set_parameter(self, key, value):
        """Store one key=value pair parsed from a [Suggestion "..."] section."""
        if key == "option":
            # Note:
            # case 1: 'option' is supported by Rocksdb OPTIONS file; in this
            # case the option belongs to one of the sections in the config
            # file and its name is prefixed by "<section_type>."
            # case 2: 'option' is not supported by Rocksdb OPTIONS file; the
            # option is not expected to have the character '.' in its name
            self.option = value
        elif key == "action":
            if self.option and not value:
                raise ValueError(self.name + ": provide action for option")
            # raises KeyError if value is not one of set/increase/decrease
            self.action = self.Action[value]
        elif key == "suggested_values":
            # normalize a single value to a one-element list
            if isinstance(value, str):
                self.suggested_values = [value]
            else:
                self.suggested_values = value
        elif key == "description":
            self.description = value

    def perform_checks(self):
        """Validate: either a free-form description, or an option plus an
        action; a 'set' action additionally needs suggested_values.
        Raises ValueError otherwise."""
        if not self.description:
            if not self.option:
                raise ValueError(self.name + ": provide option or description")
            if not self.action:
                raise ValueError(self.name + ": provide action for option")
            if self.action is self.Action.set and not self.suggested_values:
                raise ValueError(self.name + ": provide suggested value for option")

    def __repr__(self):
        # perform_checks() guarantees either description or option+action
        # is populated before this is printed
        sugg_string = "Suggestion: " + self.name
        if self.description:
            sugg_string += " description : " + self.description
        else:
            sugg_string += " option : " + self.option + " action : " + self.action.name
        if self.suggested_values:
            sugg_string += " suggested_values : " + str(self.suggested_values)
        return sugg_string
    def set_trigger(self, condition_trigger):
        # the trigger's shape depends on the data source; see the concrete
        # Condition subclasses / data sources for the exact structure
        self.trigger = condition_trigger

    def get_trigger(self):
        return self.trigger

    def is_triggered(self):
        if self.trigger:
            return True
        return False

    def set_parameter(self, key, value):
        # must be defined by the subclass
        raise NotImplementedError(self.name + ": provide source for condition")


class LogCondition(Condition):
    """A Condition matched against the Rocksdb LOG file with a regex."""

    @classmethod
    def create(cls, base_condition):
        # Specializes an existing Condition IN PLACE by rebinding
        # __class__, so the object already stored in the parser's
        # conditions_dict becomes a LogCondition without re-insertion.
        base_condition.set_data_source(DataSource.Type["LOG"])
        base_condition.__class__ = cls
        return base_condition

    def set_parameter(self, key, value):
        # keys other than 'regex' are silently ignored
        if key == "regex":
            self.regex = value

    def perform_checks(self):
        super().perform_checks()
        # NOTE(review): if 'regex' was never supplied in the spec, this
        # raises AttributeError rather than the intended ValueError —
        # confirm whether callers rely on ValueError here.
        if not self.regex:
            raise ValueError(self.name + ": provide regex for log condition")

    def __repr__(self):
        log_cond_str = "LogCondition: " + self.name
        log_cond_str += " regex: " + self.regex
        # if self.trigger:
        #     log_cond_str += (" trigger: " + str(self.trigger))
        return log_cond_str


class OptionCondition(Condition):
    """A Condition evaluated over values from the Rocksdb OPTIONS file."""

    @classmethod
    def create(cls, base_condition):
        # same in-place specialization as LogCondition.create
        base_condition.set_data_source(DataSource.Type["DB_OPTIONS"])
        base_condition.__class__ = cls
        return base_condition

    def set_parameter(self, key, value):
        if key == "options":
            # normalize a single option to a one-element list
            if isinstance(value, str):
                self.options = [value]
            else:
                self.options = value
        elif key == "evaluate":
            # a Python expression over a local name 'options'; presumably
            # eval()-ed by the DB_OPTIONS data source (cf. the TIME_SERIES
            # handling in TimeSeriesData) — confirm against db_options_parser
            self.eval_expr = value

    def perform_checks(self):
        super().perform_checks()
        # NOTE(review): like LogCondition, attributes never set surface as
        # AttributeError instead of ValueError.
        if not self.options:
            raise ValueError(self.name + ": options missing in condition")
        if not self.eval_expr:
            raise ValueError(self.name + ": expression missing in condition")

    def __repr__(self):
        opt_cond_str = "OptionCondition: " + self.name
        opt_cond_str += " options: " + str(self.options)
        opt_cond_str += " expression: " + self.eval_expr
        if self.trigger:
            opt_cond_str += " trigger: " + str(self.trigger)
        return opt_cond_str
    def set_parameter(self, key, value):
        """Store one key=value pair parsed from a TIME_SERIES condition."""
        if key == "keys":
            # normalize a single statistic name to a one-element list
            if isinstance(value, str):
                self.keys = [value]
            else:
                self.keys = value
        elif key == "behavior":
            # raises KeyError unless value names a TimeSeriesData.Behavior
            self.behavior = TimeSeriesData.Behavior[value]
        elif key == "rate_threshold":
            self.rate_threshold = float(value)
        elif key == "window_sec":
            self.window_sec = int(value)
        elif key == "evaluate":
            self.expression = value
        elif key == "aggregation_op":
            self.aggregation_op = TimeSeriesData.AggregationOperator[value]

    def perform_checks(self):
        """Validate behavior-specific parameters; raises ValueError on a
        missing/invalid combination. Side effect: defaults window_sec to
        300 for 'bursty' conditions.

        NOTE(review): unlike Log/OptionCondition this does not call
        super().perform_checks(), and attributes never set by the spec
        (keys/behavior/...) raise AttributeError here rather than
        ValueError — confirm intended.
        """
        if not self.keys:
            raise ValueError(self.name + ": specify timeseries key")
        if not self.behavior:
            raise ValueError(self.name + ": specify triggering behavior")
        if self.behavior is TimeSeriesData.Behavior.bursty:
            if not self.rate_threshold:
                raise ValueError(self.name + ": specify rate burst threshold")
            if not self.window_sec:
                self.window_sec = 300  # default window length is 5 minutes
            if len(self.keys) > 1:
                raise ValueError(self.name + ": specify only one key")
        elif self.behavior is TimeSeriesData.Behavior.evaluate_expression:
            if not (self.expression):
                raise ValueError(self.name + ": specify evaluation expression")
        else:
            raise ValueError(self.name + ": trigger behavior not supported")
class RulesSpec:
    """Loads, validates and evaluates the advisor's rules .ini file.

    Owns three registries keyed by section name: rules_dict,
    conditions_dict and suggestions_dict.
    """

    def __init__(self, rules_path):
        # path of the rules .ini file; parsed by load_rules_from_spec()
        self.file_path = rules_path

    def initialise_fields(self):
        # (re)set the registries so load_rules_from_spec can be re-run
        self.rules_dict = {}
        self.conditions_dict = {}
        self.suggestions_dict = {}

    def perform_section_checks(self):
        # validate every parsed section; each check raises ValueError on a
        # malformed spec
        for rule in self.rules_dict.values():
            rule.perform_checks()
        for cond in self.conditions_dict.values():
            cond.perform_checks()
        for sugg in self.suggestions_dict.values():
            sugg.perform_checks()

    def load_rules_from_spec(self):
        """Parse the .ini file into Rule/Condition/Suggestion objects.

        Parsing is stateful: curr_section tracks the type of the most
        recent section header and new_rule/new_cond/new_suggestion point at
        the object currently being populated.
        """
        self.initialise_fields()
        with open(self.file_path, "r") as db_rules:
            curr_section = None
            for line in db_rules:
                line = IniParser.remove_trailing_comment(line)
                if not line:
                    continue
                element = IniParser.get_element(line)
                if element is IniParser.Element.comment:
                    continue
                elif element is not IniParser.Element.key_val:
                    curr_section = element  # it's a new IniParser header
                    section_name = IniParser.get_section_name(line)
                    if element is IniParser.Element.rule:
                        new_rule = Rule(section_name)
                        self.rules_dict[section_name] = new_rule
                    elif element is IniParser.Element.cond:
                        new_cond = Condition(section_name)
                        self.conditions_dict[section_name] = new_cond
                    elif element is IniParser.Element.sugg:
                        new_suggestion = Suggestion(section_name)
                        self.suggestions_dict[section_name] = new_suggestion
                elif element is IniParser.Element.key_val:
                    key, value = IniParser.get_key_value_pair(line)
                    # NOTE: a key=value line appearing before any section
                    # header (curr_section still None) is silently dropped
                    if curr_section is IniParser.Element.rule:
                        new_rule.set_parameter(key, value)
                    elif curr_section is IniParser.Element.cond:
                        if key == "source":
                            # 'source' must be the condition's first key: the
                            # create() classmethods specialize new_cond in
                            # place (by rebinding __class__), so the object
                            # already stored in conditions_dict is updated too
                            if value == "LOG":
                                new_cond = LogCondition.create(new_cond)
                            elif value == "OPTIONS":
                                new_cond = OptionCondition.create(new_cond)
                            elif value == "TIME_SERIES":
                                new_cond = TimeSeriesCondition.create(new_cond)
                        else:
                            # before 'source' is seen this hits the base
                            # Condition.set_parameter, which raises
                            # NotImplementedError by design
                            new_cond.set_parameter(key, value)
                    elif curr_section is IniParser.Element.sugg:
                        new_suggestion.set_parameter(key, value)

    def get_rules_dict(self):
        return self.rules_dict

    def get_conditions_dict(self):
        return self.conditions_dict

    def get_suggestions_dict(self):
        return self.suggestions_dict

    def get_triggered_rules(self, data_sources, column_families):
        """Trigger all conditions against 'data_sources', then return the
        rules whose conditions all fired (scoped to column_families)."""
        self.trigger_conditions(data_sources)
        triggered_rules = []
        for rule in self.rules_dict.values():
            if rule.is_triggered(self.conditions_dict, column_families):
                triggered_rules.append(rule)
        return triggered_rules

    def trigger_conditions(self, data_sources):
        # data_sources maps DataSource.Type -> list of sources; each source
        # only checks the conditions bound to its own type
        for source_type in data_sources:
            cond_subset = [
                cond
                for cond in self.conditions_dict.values()
                if cond.get_data_source() is source_type
            ]
            if not cond_subset:
                continue
            for source in data_sources[source_type]:
                source.check_and_trigger_conditions(cond_subset)

    def print_rules(self, rules):
        # pretty-print each rule with its conditions, suggestions and
        # trigger scope (entities and/or column families)
        for rule in rules:
            print("\nRule: " + rule.name)
            for cond_name in rule.conditions:
                print(repr(self.conditions_dict[cond_name]))
            for sugg_name in rule.suggestions:
                print(repr(self.suggestions_dict[sugg_name]))
            if rule.trigger_entities:
                print("scope: entities:")
                print(rule.trigger_entities)
            if rule.trigger_column_families:
                print("scope: col_fam:")
                print(rule.trigger_column_families)
def main(args):
    """Load the rules spec, wire up the LOG/OPTIONS/TIME_SERIES data
    sources from the command-line arguments, and print the triggered
    rules."""
    # initialise the RulesSpec parser
    rule_spec_parser = RulesSpec(args.rules_spec)
    rule_spec_parser.load_rules_from_spec()
    rule_spec_parser.perform_section_checks()
    # initialize the DatabaseOptions object
    db_options = DatabaseOptions(args.rocksdb_options)
    # Create DatabaseLogs object
    db_logs = DatabaseLogs(args.log_files_path_prefix, db_options.get_column_families())
    # Create the Log STATS object
    db_log_stats = LogStatsParser(
        args.log_files_path_prefix, args.stats_dump_period_sec
    )
    data_sources = {
        DataSource.Type.DB_OPTIONS: [db_options],
        DataSource.Type.LOG: [db_logs],
        DataSource.Type.TIME_SERIES: [db_log_stats],
    }
    # ODS is an optional additional time-series source; only enabled when
    # its client binary is supplied on the command line
    if args.ods_client:
        data_sources[DataSource.Type.TIME_SERIES].append(
            OdsStatsFetcher(
                args.ods_client,
                args.ods_entity,
                args.ods_tstart,
                args.ods_tend,
                args.ods_key_prefix,
            )
        )
    triggered_rules = rule_spec_parser.get_triggered_rules(
        data_sources, db_options.get_column_families()
    )
    rule_spec_parser.print_rules(triggered_rules)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Use this script to get\
        suggestions for improving Rocksdb performance."
    )
    parser.add_argument(
        "--rules_spec",
        required=True,
        type=str,
        help="path of the file containing the expert-specified Rules",
    )
    parser.add_argument(
        "--rocksdb_options",
        required=True,
        type=str,
        help="path of the starting Rocksdb OPTIONS file",
    )
    parser.add_argument(
        "--log_files_path_prefix",
        required=True,
        type=str,
        help="path prefix of the Rocksdb LOG files",
    )
    parser.add_argument(
        "--stats_dump_period_sec",
        required=True,
        type=int,
        help="the frequency (in seconds) at which STATISTICS are printed to "
        + "the Rocksdb LOG file",
    )
    # ODS arguments
    parser.add_argument("--ods_client", type=str, help="the ODS client binary")
    parser.add_argument(
        "--ods_entity",
        type=str,
        help="the servers for which the ODS stats need to be fetched",
    )
    parser.add_argument(
        "--ods_key_prefix",
        type=str,
        help="the prefix that needs to be attached to the keys of time "
        + "series to be fetched from ODS",
    )
    parser.add_argument(
        "--ods_tstart", type=int, help="start time of timeseries to be fetched from ODS"
    )
    parser.add_argument(
        "--ods_tend", type=int, help="end time of timeseries to be fetched from ODS"
    )
    args = parser.parse_args()
    main(args)
The recognized section types are: +# Rule, Suggestion, Condition. Each section must have a name specified in "" +# in the section header. This name acts as an identifier in that section +# type's namespace. A section header looks like: +# [<section_type> "<section_name_identifier>"] +# +# There should be at least one Rule section in the file with its corresponding +# Condition and Suggestion sections. A Rule is triggered only when all of its +# conditions are triggered. The order in which a Rule's conditions and +# suggestions are specified has no significance. +# +# A Condition must be associated with a data source specified by the parameter +# 'source' and this must be the first parameter specified for the Condition. +# A condition can be associated with one or more Rules. +# +# A Suggestion is an advised change to a Rocksdb option to improve the +# performance of the database in some way. Every suggestion can be a part of +# one or more Rules. + +[Rule "stall-too-many-memtables"] +suggestions=inc-bg-flush:inc-write-buffer +conditions=stall-too-many-memtables + +[Condition "stall-too-many-memtables"] +source=LOG +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Rule "stall-too-many-L0"] +suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger +conditions=stall-too-many-L0 + +[Condition "stall-too-many-L0"] +source=LOG +regex=Stalling writes because we have \d+ level-0 files + +[Rule "stop-too-many-L0"] +suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger +conditions=stop-too-many-L0 + +[Condition "stop-too-many-L0"] +source=LOG +regex=Stopping writes because we have \d+ level-0 files + +[Rule "stall-too-many-compaction-bytes"] +suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit 
+conditions=stall-too-many-compaction-bytes + +[Condition "stall-too-many-compaction-bytes"] +source=LOG +regex=Stalling writes because of estimated pending compaction bytes \d+ + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase +suggested_values=2 + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase + +[Suggestion "inc-max-subcompactions"] +option=DBOptions.max_subcompactions +action=increase + +[Suggestion "inc-max-bg-compactions"] +option=DBOptions.max_background_compactions +action=increase +suggested_values=2 + +[Suggestion "inc-write-buffer-size"] +option=CFOptions.write_buffer_size +action=increase + +[Suggestion "dec-max-bytes-for-level-base"] +option=CFOptions.max_bytes_for_level_base +action=decrease + +[Suggestion "inc-l0-slowdown-writes-trigger"] +option=CFOptions.level0_slowdown_writes_trigger +action=increase + +[Suggestion "inc-l0-stop-writes-trigger"] +option=CFOptions.level0_stop_writes_trigger +action=increase + +[Suggestion "inc-hard-pending-compaction-bytes-limit"] +option=CFOptions.hard_pending_compaction_bytes_limit +action=increase + +[Suggestion "inc-soft-pending-compaction-bytes-limit"] +option=CFOptions.soft_pending_compaction_bytes_limit +action=increase + +[Rule "level0-level1-ratio"] +conditions=level0-level1-ratio +suggestions=inc-base-max-bytes + +[Condition "level0-level1-ratio"] +source=OPTIONS +options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base +evaluate=int(options[0])*int(options[1])-int(options[2])>=1 # should evaluate to a boolean, condition triggered if evaluates to true + +[Suggestion "inc-base-max-bytes"] +option=CFOptions.max_bytes_for_level_base +action=increase + +[Rules "tuning-iostat-burst"] +conditions=large-db-get-p99 +suggestions=bytes-per-sync-non0:wal-bytes-per-sync-non0:set-rate-limiter +#overlap_time_period=10m + +[Condition "write-burst"] +source=TIME_SERIES 
+keys=dyno.flash_write_bytes_per_sec +behavior=bursty +window_sec=300 # the smaller this window, the more sensitivity to changes in the time series, so the rate_threshold should be bigger; when it's 60, then same as diff(%) +rate_threshold=20 + +[Condition "large-p99-read-latency"] +source=TIME_SERIES +keys=[]rocksdb.read.block.get.micros.p99 +behavior=bursty +window_sec=300 +rate_threshold=10 + +[Condition "large-db-get-p99"] +source=TIME_SERIES +keys=[]rocksdb.db.get.micros.p50:[]rocksdb.db.get.micros.p99 +behavior=evaluate_expression +evaluate=(keys[1]/keys[0])>5 + +[Suggestion "bytes-per-sync-non0"] +option=DBOptions.bytes_per_sync +action=set +suggested_values=1048576 + +[Suggestion "wal-bytes-per-sync-non0"] +option=DBOptions.wal_bytes_per_sync +action=set +suggested_values=1048576 + +[Suggestion "set-rate-limiter"] +option=rate_limiter_bytes_per_sec +action=set +suggested_values=1024000 + +[Rule "bloom-filter-percent-useful"] +conditions=bloom-filter-percent-useful +suggestions=inc-bloom-bits-per-key + +[Condition "bloom-filter-percent-useful"] +source=TIME_SERIES +keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count +behavior=evaluate_expression +evaluate=((keys[0]+keys[2])/(keys[0]+keys[1]))<0.9 # should evaluate to a boolean +aggregation_op=latest + +[Rule "bloom-not-enabled"] +conditions=bloom-not-enabled +suggestions=inc-bloom-bits-per-key + +[Condition "bloom-not-enabled"] +source=TIME_SERIES +keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count +behavior=evaluate_expression +evaluate=keys[0]+keys[1]+keys[2]==0 +aggregation_op=avg + +[Suggestion "inc-bloom-bits-per-key"] +option=bloom_bits +action=increase +suggested_values=2 + +[Rule "small-l0-files"] +conditions=small-l0-files +suggestions=dec-max-bytes-for-level-base:inc-write-buffer-size + +[Condition "small-l0-files"] +source=OPTIONS 
+options=CFOptions.max_bytes_for_level_base:CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size +evaluate=int(options[0])>(10*int(options[1])*int(options[2])) + +[Rule "decompress-time-long"] +conditions=decompress-time-long +suggestions=dec-block-size:inc-block-cache-size:faster-compression-type + +[Condition "decompress-time-long"] +source=TIME_SERIES +keys=block_decompress_time:block_read_time:block_checksum_time +behavior=evaluate_expression +evaluate=(keys[0]/(keys[0]+keys[1]+keys[2]))>0.3 + +[Suggestion "dec-block-size"] +option=TableOptions.BlockBasedTable.block_size +action=decrease + +[Suggestion "inc-block-cache-size"] +option=cache_size +action=increase +suggested_values=16000000 + +[Suggestion "faster-compression-type"] +option=CFOptions.compression +action=set +suggested_values=kLZ4Compression diff --git a/src/rocksdb/tools/advisor/test/__init__.py b/src/rocksdb/tools/advisor/test/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/src/rocksdb/tools/advisor/test/__init__.py diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-0 b/src/rocksdb/tools/advisor/test/input_files/LOG-0 new file mode 100644 index 000000000..3c9d51641 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/LOG-0 @@ -0,0 +1,30 @@ +2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0 +2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0 +2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8 +2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, 
"flush_reason": "Write Buffer Full"} +2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started +2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1. +2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 +2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886 +2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886 +2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK +2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started +2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1. 
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 +2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK +2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stalling writes because of estimated pending compaction bytes 14410584 +2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started +2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done +2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK +2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 
2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1} +2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK +2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84} +2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete +2018/05/25-14:34:21.049000 7f82bd676200 [db/db_impl.cc:563] [col-fam-A] random log message for testing +2018/05/25-14:34:21.049010 7f82bd676200 [db/db_impl.cc:234] [col-fam-B] log continuing on next line +remaining part of the log +2018/05/25-14:34:21.049020 7f82bd676200 [db/db_impl.cc:653] [col-fam-A] another random log message +2018/05/25-14:34:21.049025 7f82bd676200 [db/db_impl.cc:331] [unknown] random log message no column family diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-1 b/src/rocksdb/tools/advisor/test/input_files/LOG-1 new file mode 100644 index 000000000..b163f9a99 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/LOG-1 @@ -0,0 +1,25 @@ +2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0 +2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0 +2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8 +2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, "flush_reason": "Write Buffer 
Full"} +2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started +2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1. +2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 +2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886 +2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886 +2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK +2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started +2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1. 
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 +2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK +2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stopping writes because of estimated pending compaction bytes 14410584 +2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started +2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done +2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}} +2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK +2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 
2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1} +2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK +2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84} +2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete diff --git a/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005 b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005 new file mode 100644 index 000000000..009edb04d --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005 @@ -0,0 +1,49 @@ +# This is a RocksDB option file. +# +# For detailed file format spec, please refer to the example file +# in examples/rocksdb_option_file_example.ini +# + +[Version] + rocksdb_version=5.14.0 + options_file_version=1.1 + +[DBOptions] + manual_wal_flush=false + allow_ingest_behind=false + db_write_buffer_size=0 + db_log_dir= + random_access_max_buffer_size=1048576 + +[CFOptions "default"] + ttl=0 + max_bytes_for_level_base=268435456 + max_bytes_for_level_multiplier=10.000000 + level0_file_num_compaction_trigger=4 + level0_stop_writes_trigger=36 + write_buffer_size=4194000 + min_write_buffer_number_to_merge=1 + num_levels=7 + compaction_filter_factory=nullptr + compaction_style=kCompactionStyleLevel + +[TableOptions/BlockBasedTable "default"] + block_align=false + index_type=kBinarySearch + +[CFOptions "col_fam_A"] +ttl=0 +max_bytes_for_level_base=268435456 +max_bytes_for_level_multiplier=10.000000 +level0_file_num_compaction_trigger=5 +level0_stop_writes_trigger=36 +write_buffer_size=1024000 +min_write_buffer_number_to_merge=1 +num_levels=5 +compaction_filter_factory=nullptr +compaction_style=kCompactionStyleLevel + 
+[TableOptions/BlockBasedTable "col_fam_A"] +block_align=true +block_restart_interval=16 +index_type=kBinarySearch diff --git a/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts new file mode 100644 index 000000000..e8ade9e3e --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts @@ -0,0 +1,3 @@ +rocksdb.number.block.decompressed.count: 1530896335 88.0, 1530896361 788338.0, 1530896387 1539256.0, 1530896414 2255696.0, 1530896440 3009325.0, 1530896466 3767183.0, 1530896492 4529775.0, 1530896518 5297809.0, 1530896545 6033802.0, 1530896570 6794129.0 +rocksdb.db.get.micros.p50: 1530896335 295.5, 1530896361 16.561841, 1530896387 16.20677, 1530896414 16.31508, 1530896440 16.346602, 1530896466 16.284669, 1530896492 16.16005, 1530896518 16.069096, 1530896545 16.028746, 1530896570 15.9638 +rocksdb.manifest.file.sync.micros.p99: 1530896335 649.0, 1530896361 835.0, 1530896387 1435.0, 1530896414 9938.0, 1530896440 9938.0, 1530896466 9938.0, 1530896492 9938.0, 1530896518 1882.0, 1530896545 1837.0, 1530896570 1792.0 diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini new file mode 100644 index 000000000..23be55dde --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini @@ -0,0 +1,56 @@ +[Rule "missing-suggestions"] +suggestions= +conditions=missing-source + +[Condition "normal-rule"] +source=LOG +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase + +[Rule "missing-conditions"] +conditions= +suggestions=missing-description + +[Condition "missing-options"] +source=OPTIONS +options= 
+evaluate=int(options[0])*int(options[1])-int(options[2])<(-251659456) # should evaluate to a boolean + +[Rule "missing-expression"] +conditions=missing-expression +suggestions=missing-description + +[Condition "missing-expression"] +source=OPTIONS +options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base +evaluate= + +[Suggestion "missing-description"] +description= + +[Rule "stop-too-many-L0"] +suggestions=inc-max-bg-compactions:missing-action:inc-l0-stop-writes-trigger +conditions=missing-regex + +[Condition "missing-regex"] +source=LOG +regex= + +[Suggestion "missing-option"] +option= +action=increase + +[Suggestion "normal-suggestion"] +option=CFOptions.write_buffer_size +action=increase + +[Suggestion "inc-l0-stop-writes-trigger"] +option=CFOptions.level0_stop_writes_trigger +action=increase diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini new file mode 100644 index 000000000..bce21dba9 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini @@ -0,0 +1,15 @@ +[Rule "normal-rule"] +suggestions=inc-bg-flush:inc-write-buffer +conditions=missing-source + +[Condition "missing-source"] +source= +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini new file mode 100644 index 000000000..73c06e469 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini @@ -0,0 +1,15 @@ +[Rule "normal-rule"] +suggestions=missing-action:inc-write-buffer +conditions=missing-source + +[Condition "normal-condition"] +source=LOG 
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Suggestion "missing-action"] +option=DBOptions.max_background_flushes +action= + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini new file mode 100644 index 000000000..4d4aa3c70 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini @@ -0,0 +1,15 @@ +[Rule "normal-rule"] +suggestions=inc-bg-flush +conditions=missing-source + +[Condition "normal-condition"] +source=LOG +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase + +[Suggestion] # missing section name +option=CFOptions.max_write_buffer_number +action=increase diff --git a/src/rocksdb/tools/advisor/test/input_files/test_rules.ini b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini new file mode 100644 index 000000000..97b9374fc --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini @@ -0,0 +1,47 @@ +[Rule "single-condition-false"] +suggestions=inc-bg-flush:inc-write-buffer +conditions=log-4-false + +[Rule "multiple-conds-true"] +suggestions=inc-write-buffer +conditions=log-1-true:log-2-true:log-3-true + +[Rule "multiple-conds-one-false"] +suggestions=inc-bg-flush +conditions=log-1-true:log-4-false:log-3-true + +[Rule "multiple-conds-all-false"] +suggestions=l0-l1-ratio-health-check +conditions=log-4-false:options-1-false + +[Condition "log-1-true"] +source=LOG +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Condition "log-2-true"] +source=LOG +regex=Stalling writes because we have \d+ level-0 files + +[Condition 
"log-3-true"] +source=LOG +regex=Stopping writes because we have \d+ level-0 files + +[Condition "log-4-false"] +source=LOG +regex=Stalling writes because of estimated pending compaction bytes \d+ + +[Condition "options-1-false"] +source=OPTIONS +options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:DBOptions.random_access_max_buffer_size +evaluate=int(options[0])*int(options[1])-int(options[2])<0 # should evaluate to a boolean + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase + +[Suggestion "l0-l1-ratio-health-check"] +description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < 5) is satisfied' diff --git a/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini new file mode 100644 index 000000000..83b96da2b --- /dev/null +++ b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini @@ -0,0 +1,83 @@ +[Rule "stall-too-many-memtables"] +suggestions=inc-bg-flush:inc-write-buffer +conditions=stall-too-many-memtables + +[Condition "stall-too-many-memtables"] +source=LOG +regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ + +[Rule "stall-too-many-L0"] +suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger +conditions=stall-too-many-L0 + +[Condition "stall-too-many-L0"] +source=LOG +regex=Stalling writes because we have \d+ level-0 files + +[Rule "stop-too-many-L0"] +suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger +conditions=stop-too-many-L0 + +[Condition "stop-too-many-L0"] +source=LOG +regex=Stopping writes because we have \d+ level-0 files + +[Rule 
"stall-too-many-compaction-bytes"] +suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit +conditions=stall-too-many-compaction-bytes + +[Condition "stall-too-many-compaction-bytes"] +source=LOG +regex=Stalling writes because of estimated pending compaction bytes \d+ + +[Suggestion "inc-bg-flush"] +option=DBOptions.max_background_flushes +action=increase + +[Suggestion "inc-write-buffer"] +option=CFOptions.max_write_buffer_number +action=increase + +[Suggestion "inc-max-subcompactions"] +option=DBOptions.max_subcompactions +action=increase + +[Suggestion "inc-max-bg-compactions"] +option=DBOptions.max_background_compactions +action=increase + +[Suggestion "inc-write-buffer-size"] +option=CFOptions.write_buffer_size +action=increase + +[Suggestion "dec-max-bytes-for-level-base"] +option=CFOptions.max_bytes_for_level_base +action=decrease + +[Suggestion "inc-l0-slowdown-writes-trigger"] +option=CFOptions.level0_slowdown_writes_trigger +action=increase + +[Suggestion "inc-l0-stop-writes-trigger"] +option=CFOptions.level0_stop_writes_trigger +action=increase + +[Suggestion "inc-hard-pending-compaction-bytes-limit"] +option=CFOptions.hard_pending_compaction_bytes_limit +action=increase + +[Suggestion "inc-soft-pending-compaction-bytes-limit"] +option=CFOptions.soft_pending_compaction_bytes_limit +action=increase + +[Rule "level0-level1-ratio"] +conditions=level0-level1-ratio +suggestions=l0-l1-ratio-health-check + +[Condition "level0-level1-ratio"] +source=OPTIONS +options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base +evaluate=int(options[0])*int(options[1])-int(options[2])>=-268173312 # should evaluate to a boolean, condition triggered if evaluates to true + +[Suggestion "l0-l1-ratio-health-check"] +description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < 
-268173312) is satisfied' diff --git a/src/rocksdb/tools/advisor/test/test_db_bench_runner.py b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py new file mode 100644 index 000000000..57306c942 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py @@ -0,0 +1,141 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import os +import unittest + +from advisor.db_bench_runner import DBBenchRunner +from advisor.db_log_parser import DataSource, NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions + + +class TestDBBenchRunnerMethods(unittest.TestCase): + def setUp(self): + self.pos_args = [ + "./../../db_bench", + "overwrite", + "use_existing_db=true", + "duration=10", + ] + self.bench_runner = DBBenchRunner(self.pos_args) + this_path = os.path.abspath(os.path.dirname(__file__)) + options_path = os.path.join(this_path, "input_files/OPTIONS-000005") + self.db_options = DatabaseOptions(options_path) + + def test_setup(self): + self.assertEqual(self.bench_runner.db_bench_binary, self.pos_args[0]) + self.assertEqual(self.bench_runner.benchmark, self.pos_args[1]) + self.assertSetEqual( + set(self.bench_runner.db_bench_args), set(self.pos_args[2:]) + ) + + def test_get_info_log_file_name(self): + log_file_name = DBBenchRunner.get_info_log_file_name(None, "random_path") + self.assertEqual(log_file_name, "LOG") + + log_file_name = DBBenchRunner.get_info_log_file_name( + "/dev/shm/", "/tmp/rocksdbtest-155919/dbbench/" + ) + self.assertEqual(log_file_name, "tmp_rocksdbtest-155919_dbbench_LOG") + + def test_get_opt_args_str(self): + misc_opt_dict = {"bloom_bits": 2, "empty_opt": None, "rate_limiter": 3} + optional_args_str = DBBenchRunner.get_opt_args_str(misc_opt_dict) + self.assertEqual(optional_args_str, " --bloom_bits=2 
--rate_limiter=3") + + def test_get_log_options(self): + db_path = "/tmp/rocksdb-155919/dbbench" + # when db_log_dir is present in the db_options + update_dict = { + "DBOptions.db_log_dir": {NO_COL_FAMILY: "/dev/shm"}, + "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: "20"}, + } + self.db_options.update_options(update_dict) + log_file_prefix, stats_freq = self.bench_runner.get_log_options( + self.db_options, db_path + ) + self.assertEqual(log_file_prefix, "/dev/shm/tmp_rocksdb-155919_dbbench_LOG") + self.assertEqual(stats_freq, 20) + + update_dict = { + "DBOptions.db_log_dir": {NO_COL_FAMILY: None}, + "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: "30"}, + } + self.db_options.update_options(update_dict) + log_file_prefix, stats_freq = self.bench_runner.get_log_options( + self.db_options, db_path + ) + self.assertEqual(log_file_prefix, "/tmp/rocksdb-155919/dbbench/LOG") + self.assertEqual(stats_freq, 30) + + def test_build_experiment_command(self): + # add some misc_options to db_options + update_dict = { + "bloom_bits": {NO_COL_FAMILY: 2}, + "rate_limiter_bytes_per_sec": {NO_COL_FAMILY: 128000000}, + } + self.db_options.update_options(update_dict) + db_path = "/dev/shm" + experiment_command = self.bench_runner._build_experiment_command( + self.db_options, db_path + ) + opt_args_str = DBBenchRunner.get_opt_args_str( + self.db_options.get_misc_options() + ) + opt_args_str += " --options_file=" + self.db_options.generate_options_config( + "12345" + ) + for arg in self.pos_args[2:]: + opt_args_str += " --" + arg + expected_command = ( + self.pos_args[0] + + " --benchmarks=" + + self.pos_args[1] + + " --statistics --perf_level=3 --db=" + + db_path + + opt_args_str + ) + self.assertEqual(experiment_command, expected_command) + + +class TestDBBenchRunner(unittest.TestCase): + def setUp(self): + # Note: the db_bench binary should be present in the rocksdb/ directory + self.pos_args = [ + "./../../db_bench", + "overwrite", + "use_existing_db=true", + "duration=20", + 
] + self.bench_runner = DBBenchRunner(self.pos_args) + this_path = os.path.abspath(os.path.dirname(__file__)) + options_path = os.path.join(this_path, "input_files/OPTIONS-000005") + self.db_options = DatabaseOptions(options_path) + + def test_experiment_output(self): + update_dict = {"bloom_bits": {NO_COL_FAMILY: 2}} + self.db_options.update_options(update_dict) + db_path = "/dev/shm" + data_sources, throughput = self.bench_runner.run_experiment( + self.db_options, db_path + ) + self.assertEqual( + data_sources[DataSource.Type.DB_OPTIONS][0].type, DataSource.Type.DB_OPTIONS + ) + self.assertEqual(data_sources[DataSource.Type.LOG][0].type, DataSource.Type.LOG) + self.assertEqual(len(data_sources[DataSource.Type.TIME_SERIES]), 2) + self.assertEqual( + data_sources[DataSource.Type.TIME_SERIES][0].type, + DataSource.Type.TIME_SERIES, + ) + self.assertEqual( + data_sources[DataSource.Type.TIME_SERIES][1].type, + DataSource.Type.TIME_SERIES, + ) + self.assertEqual(data_sources[DataSource.Type.TIME_SERIES][1].stats_freq_sec, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/rocksdb/tools/advisor/test/test_db_log_parser.py b/src/rocksdb/tools/advisor/test/test_db_log_parser.py new file mode 100644 index 000000000..6862691c1 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/test_db_log_parser.py @@ -0,0 +1,96 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). 
+ +import os +import unittest + +from advisor.db_log_parser import DatabaseLogs, Log, NO_COL_FAMILY +from advisor.rule_parser import Condition, LogCondition + + +class TestLog(unittest.TestCase): + def setUp(self): + self.column_families = ["default", "col_fam_A"] + + def test_get_column_family(self): + test_log = ( + "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " + + "[col_fam_A] [JOB 44] Level-0 flush table #84: 1890780 bytes OK" + ) + db_log = Log(test_log, self.column_families) + self.assertEqual("col_fam_A", db_log.get_column_family()) + + test_log = ( + "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] " + + "[JOB 44] Level-0 flush table #84: 1890780 bytes OK" + ) + db_log = Log(test_log, self.column_families) + db_log.append_message("[default] some remaining part of log") + self.assertEqual(NO_COL_FAMILY, db_log.get_column_family()) + + def test_get_methods(self): + hr_time = "2018/05/25-14:30:25.491635" + context = "7f82ba72e700" + message = ( + "[db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table " + + "#23: started" + ) + test_log = hr_time + " " + context + " " + message + db_log = Log(test_log, self.column_families) + self.assertEqual(db_log.get_message(), message) + remaining_message = "[col_fam_A] some more logs" + db_log.append_message(remaining_message) + self.assertEqual(db_log.get_human_readable_time(), "2018/05/25-14:30:25.491635") + self.assertEqual(db_log.get_context(), "7f82ba72e700") + self.assertEqual(db_log.get_timestamp(), 1527258625) + self.assertEqual(db_log.get_message(), str(message + "\n" + remaining_message)) + + def test_is_new_log(self): + new_log = "2018/05/25-14:34:21.047233 context random new log" + remaining_log = "2018/05/25 not really a new log" + self.assertTrue(Log.is_new_log(new_log)) + self.assertFalse(Log.is_new_log(remaining_log)) + + +class TestDatabaseLogs(unittest.TestCase): + def test_check_and_trigger_conditions(self): + this_path = os.path.abspath(os.path.dirname(__file__)) 
+ logs_path_prefix = os.path.join(this_path, "input_files/LOG-0") + column_families = ["default", "col-fam-A", "col-fam-B"] + db_logs = DatabaseLogs(logs_path_prefix, column_families) + # matches, has 2 col_fams + condition1 = LogCondition.create(Condition("cond-A")) + condition1.set_parameter("regex", "random log message") + # matches, multiple lines message + condition2 = LogCondition.create(Condition("cond-B")) + condition2.set_parameter("regex", "continuing on next line") + # does not match + condition3 = LogCondition.create(Condition("cond-C")) + condition3.set_parameter("regex", "this should match no log") + db_logs.check_and_trigger_conditions([condition1, condition2, condition3]) + cond1_trigger = condition1.get_trigger() + self.assertEqual(2, len(cond1_trigger.keys())) + self.assertSetEqual({"col-fam-A", NO_COL_FAMILY}, set(cond1_trigger.keys())) + self.assertEqual(2, len(cond1_trigger["col-fam-A"])) + messages = [ + "[db/db_impl.cc:563] [col-fam-A] random log message for testing", + "[db/db_impl.cc:653] [col-fam-A] another random log message", + ] + self.assertIn(cond1_trigger["col-fam-A"][0].get_message(), messages) + self.assertIn(cond1_trigger["col-fam-A"][1].get_message(), messages) + self.assertEqual(1, len(cond1_trigger[NO_COL_FAMILY])) + self.assertEqual( + cond1_trigger[NO_COL_FAMILY][0].get_message(), + "[db/db_impl.cc:331] [unknown] random log message no column family", + ) + cond2_trigger = condition2.get_trigger() + self.assertEqual(["col-fam-B"], list(cond2_trigger.keys())) + self.assertEqual(1, len(cond2_trigger["col-fam-B"])) + self.assertEqual( + cond2_trigger["col-fam-B"][0].get_message(), + "[db/db_impl.cc:234] [col-fam-B] log continuing on next line\n" + + "remaining part of the log", + ) + self.assertIsNone(condition3.get_trigger()) diff --git a/src/rocksdb/tools/advisor/test/test_db_options_parser.py b/src/rocksdb/tools/advisor/test/test_db_options_parser.py new file mode 100644 index 000000000..cdeebaefa --- /dev/null +++ 
b/src/rocksdb/tools/advisor/test/test_db_options_parser.py @@ -0,0 +1,214 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import os +import unittest + +from advisor.db_log_parser import NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import Condition, OptionCondition + + +class TestDatabaseOptions(unittest.TestCase): + def setUp(self): + self.this_path = os.path.abspath(os.path.dirname(__file__)) + self.og_options = os.path.join(self.this_path, "input_files/OPTIONS-000005") + misc_options = ["bloom_bits = 4", "rate_limiter_bytes_per_sec = 1024000"] + # create the options object + self.db_options = DatabaseOptions(self.og_options, misc_options) + # perform clean-up before running tests + self.generated_options = os.path.join( + self.this_path, "../temp/OPTIONS_testing.tmp" + ) + if os.path.isfile(self.generated_options): + os.remove(self.generated_options) + + def test_get_options_diff(self): + old_opt = { + "DBOptions.stats_dump_freq_sec": {NO_COL_FAMILY: "20"}, + "CFOptions.write_buffer_size": { + "default": "1024000", + "col_fam_A": "128000", + "col_fam_B": "128000000", + }, + "DBOptions.use_fsync": {NO_COL_FAMILY: "true"}, + "DBOptions.max_log_file_size": {NO_COL_FAMILY: "128000000"}, + } + new_opt = { + "bloom_bits": {NO_COL_FAMILY: "4"}, + "CFOptions.write_buffer_size": { + "default": "128000000", + "col_fam_A": "128000", + "col_fam_C": "128000000", + }, + "DBOptions.use_fsync": {NO_COL_FAMILY: "true"}, + "DBOptions.max_log_file_size": {NO_COL_FAMILY: "0"}, + } + diff = DatabaseOptions.get_options_diff(old_opt, new_opt) + + expected_diff = { + "DBOptions.stats_dump_freq_sec": {NO_COL_FAMILY: ("20", None)}, + "bloom_bits": {NO_COL_FAMILY: (None, "4")}, + "CFOptions.write_buffer_size": { + "default": 
("1024000", "128000000"), + "col_fam_B": ("128000000", None), + "col_fam_C": (None, "128000000"), + }, + "DBOptions.max_log_file_size": {NO_COL_FAMILY: ("128000000", "0")}, + } + self.assertDictEqual(diff, expected_diff) + + def test_is_misc_option(self): + self.assertTrue(DatabaseOptions.is_misc_option("bloom_bits")) + self.assertFalse( + DatabaseOptions.is_misc_option("DBOptions.stats_dump_freq_sec") + ) + + def test_set_up(self): + options = self.db_options.get_all_options() + self.assertEqual(22, len(options.keys())) + expected_misc_options = { + "bloom_bits": "4", + "rate_limiter_bytes_per_sec": "1024000", + } + self.assertDictEqual(expected_misc_options, self.db_options.get_misc_options()) + self.assertListEqual( + ["default", "col_fam_A"], self.db_options.get_column_families() + ) + + def test_get_options(self): + opt_to_get = [ + "DBOptions.manual_wal_flush", + "DBOptions.db_write_buffer_size", + "bloom_bits", + "CFOptions.compaction_filter_factory", + "CFOptions.num_levels", + "rate_limiter_bytes_per_sec", + "TableOptions.BlockBasedTable.block_align", + "random_option", + ] + options = self.db_options.get_options(opt_to_get) + expected_options = { + "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "false"}, + "DBOptions.db_write_buffer_size": {NO_COL_FAMILY: "0"}, + "bloom_bits": {NO_COL_FAMILY: "4"}, + "CFOptions.compaction_filter_factory": { + "default": "nullptr", + "col_fam_A": "nullptr", + }, + "CFOptions.num_levels": {"default": "7", "col_fam_A": "5"}, + "rate_limiter_bytes_per_sec": {NO_COL_FAMILY: "1024000"}, + "TableOptions.BlockBasedTable.block_align": { + "default": "false", + "col_fam_A": "true", + }, + } + self.assertDictEqual(expected_options, options) + + def test_update_options(self): + # add new, update old, set old + # before updating + expected_old_opts = { + "DBOptions.db_log_dir": {NO_COL_FAMILY: None}, + "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "false"}, + "bloom_bits": {NO_COL_FAMILY: "4"}, + "CFOptions.num_levels": {"default": 
"7", "col_fam_A": "5"}, + "TableOptions.BlockBasedTable.block_restart_interval": {"col_fam_A": "16"}, + } + get_opts = list(expected_old_opts.keys()) + options = self.db_options.get_options(get_opts) + self.assertEqual(expected_old_opts, options) + # after updating options + update_opts = { + "DBOptions.db_log_dir": {NO_COL_FAMILY: "/dev/shm"}, + "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "true"}, + "bloom_bits": {NO_COL_FAMILY: "2"}, + "CFOptions.num_levels": {"col_fam_A": "7"}, + "TableOptions.BlockBasedTable.block_restart_interval": {"default": "32"}, + "random_misc_option": {NO_COL_FAMILY: "something"}, + } + self.db_options.update_options(update_opts) + update_opts["CFOptions.num_levels"]["default"] = "7" + update_opts["TableOptions.BlockBasedTable.block_restart_interval"] = { + "default": "32", + "col_fam_A": "16", + } + get_opts.append("random_misc_option") + options = self.db_options.get_options(get_opts) + self.assertDictEqual(update_opts, options) + expected_misc_options = { + "bloom_bits": "2", + "rate_limiter_bytes_per_sec": "1024000", + "random_misc_option": "something", + } + self.assertDictEqual(expected_misc_options, self.db_options.get_misc_options()) + + def test_generate_options_config(self): + # make sure file does not exist from before + self.assertFalse(os.path.isfile(self.generated_options)) + self.db_options.generate_options_config("testing") + self.assertTrue(os.path.isfile(self.generated_options)) + + def test_check_and_trigger_conditions(self): + # options only from CFOptions + # setup the OptionCondition objects to check and trigger + update_dict = { + "CFOptions.level0_file_num_compaction_trigger": {"col_fam_A": "4"}, + "CFOptions.max_bytes_for_level_base": {"col_fam_A": "10"}, + } + self.db_options.update_options(update_dict) + cond1 = Condition("opt-cond-1") + cond1 = OptionCondition.create(cond1) + cond1.set_parameter( + "options", + [ + "CFOptions.level0_file_num_compaction_trigger", + 
"TableOptions.BlockBasedTable.block_restart_interval", + "CFOptions.max_bytes_for_level_base", + ], + ) + cond1.set_parameter( + "evaluate", "int(options[0])*int(options[1])-int(options[2])>=0" + ) + # only DBOptions + cond2 = Condition("opt-cond-2") + cond2 = OptionCondition.create(cond2) + cond2.set_parameter( + "options", + [ + "DBOptions.db_write_buffer_size", + "bloom_bits", + "rate_limiter_bytes_per_sec", + ], + ) + cond2.set_parameter( + "evaluate", "(int(options[2]) * int(options[1]) * int(options[0]))==0" + ) + # mix of CFOptions and DBOptions + cond3 = Condition("opt-cond-3") + cond3 = OptionCondition.create(cond3) + cond3.set_parameter( + "options", + [ + "DBOptions.db_write_buffer_size", # 0 + "CFOptions.num_levels", # 5, 7 + "bloom_bits", # 4 + ], + ) + cond3.set_parameter( + "evaluate", "int(options[2])*int(options[0])+int(options[1])>6" + ) + self.db_options.check_and_trigger_conditions([cond1, cond2, cond3]) + + cond1_trigger = {"col_fam_A": ["4", "16", "10"]} + self.assertDictEqual(cond1_trigger, cond1.get_trigger()) + cond2_trigger = {NO_COL_FAMILY: ["0", "4", "1024000"]} + self.assertDictEqual(cond2_trigger, cond2.get_trigger()) + cond3_trigger = {"default": ["0", "7", "4"]} + self.assertDictEqual(cond3_trigger, cond3.get_trigger()) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py new file mode 100644 index 000000000..e2c29ab74 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py @@ -0,0 +1,121 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). 
+ +import os +import time +import unittest +from unittest.mock import MagicMock + +from advisor.db_stats_fetcher import DatabasePerfContext, LogStatsParser +from advisor.db_timeseries_parser import NO_ENTITY +from advisor.rule_parser import Condition, TimeSeriesCondition + + +class TestLogStatsParser(unittest.TestCase): + def setUp(self): + this_path = os.path.abspath(os.path.dirname(__file__)) + stats_file = os.path.join(this_path, "input_files/log_stats_parser_keys_ts") + # populate the keys_ts dictionary of LogStatsParser + self.stats_dict = {NO_ENTITY: {}} + with open(stats_file, "r") as fp: + for line in fp: + stat_name = line.split(":")[0].strip() + self.stats_dict[NO_ENTITY][stat_name] = {} + token_list = line.split(":")[1].strip().split(",") + for token in token_list: + timestamp = int(token.split()[0]) + value = float(token.split()[1]) + self.stats_dict[NO_ENTITY][stat_name][timestamp] = value + self.log_stats_parser = LogStatsParser("dummy_log_file", 20) + self.log_stats_parser.keys_ts = self.stats_dict + + def test_check_and_trigger_conditions_bursty(self): + # mock fetch_timeseries() because 'keys_ts' has been pre-populated + self.log_stats_parser.fetch_timeseries = MagicMock() + # condition: bursty + cond1 = Condition("cond-1") + cond1 = TimeSeriesCondition.create(cond1) + cond1.set_parameter("keys", "rocksdb.db.get.micros.p50") + cond1.set_parameter("behavior", "bursty") + cond1.set_parameter("window_sec", 40) + cond1.set_parameter("rate_threshold", 0) + self.log_stats_parser.check_and_trigger_conditions([cond1]) + expected_cond_trigger = {NO_ENTITY: {1530896440: 0.9767546362322214}} + self.assertDictEqual(expected_cond_trigger, cond1.get_trigger()) + # ensure that fetch_timeseries() was called once + self.log_stats_parser.fetch_timeseries.assert_called_once() + + def test_check_and_trigger_conditions_eval_agg(self): + # mock fetch_timeseries() because 'keys_ts' has been pre-populated + self.log_stats_parser.fetch_timeseries = MagicMock() + # 
condition: evaluate_expression + cond1 = Condition("cond-1") + cond1 = TimeSeriesCondition.create(cond1) + cond1.set_parameter("keys", "rocksdb.db.get.micros.p50") + cond1.set_parameter("behavior", "evaluate_expression") + keys = ["rocksdb.manifest.file.sync.micros.p99", "rocksdb.db.get.micros.p50"] + cond1.set_parameter("keys", keys) + cond1.set_parameter("aggregation_op", "latest") + # condition evaluates to FALSE + cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)>200") + self.log_stats_parser.check_and_trigger_conditions([cond1]) + expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]} + self.assertIsNone(cond1.get_trigger()) + # condition evaluates to TRUE + cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)<200") + self.log_stats_parser.check_and_trigger_conditions([cond1]) + expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]} + self.assertDictEqual(expected_cond_trigger, cond1.get_trigger()) + # ensure that fetch_timeseries() was called + self.log_stats_parser.fetch_timeseries.assert_called() + + def test_check_and_trigger_conditions_eval(self): + # mock fetch_timeseries() because 'keys_ts' has been pre-populated + self.log_stats_parser.fetch_timeseries = MagicMock() + # condition: evaluate_expression + cond1 = Condition("cond-1") + cond1 = TimeSeriesCondition.create(cond1) + cond1.set_parameter("keys", "rocksdb.db.get.micros.p50") + cond1.set_parameter("behavior", "evaluate_expression") + keys = ["rocksdb.manifest.file.sync.micros.p99", "rocksdb.db.get.micros.p50"] + cond1.set_parameter("keys", keys) + cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)>500") + self.log_stats_parser.check_and_trigger_conditions([cond1]) + expected_trigger = { + NO_ENTITY: { + 1530896414: [9938.0, 16.31508], + 1530896440: [9938.0, 16.346602], + 1530896466: [9938.0, 16.284669], + 1530896492: [9938.0, 16.16005], + } + } + self.assertDictEqual(expected_trigger, cond1.get_trigger()) + self.log_stats_parser.fetch_timeseries.assert_called_once() + + +class 
TestDatabasePerfContext(unittest.TestCase): + def test_unaccumulate_metrics(self): + perf_dict = { + "user_key_comparison_count": 675903942, + "block_cache_hit_count": 830086, + } + timestamp = int(time.time()) + perf_ts = {} + for key in perf_dict: + perf_ts[key] = {} + start_val = perf_dict[key] + for ix in range(5): + perf_ts[key][timestamp + (ix * 10)] = start_val + (2 * ix * ix) + db_perf_context = DatabasePerfContext(perf_ts, 10, True) + timestamps = [timestamp + (ix * 10) for ix in range(1, 5, 1)] + values = [val for val in range(2, 15, 4)] + inner_dict = {timestamps[ix]: values[ix] for ix in range(4)} + expected_keys_ts = { + NO_ENTITY: { + "user_key_comparison_count": inner_dict, + "block_cache_hit_count": inner_dict, + } + } + self.assertDictEqual(expected_keys_ts, db_perf_context.keys_ts) diff --git a/src/rocksdb/tools/advisor/test/test_rule_parser.py b/src/rocksdb/tools/advisor/test/test_rule_parser.py new file mode 100644 index 000000000..4ea4ca159 --- /dev/null +++ b/src/rocksdb/tools/advisor/test/test_rule_parser.py @@ -0,0 +1,226 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). 
+ +import os +import unittest + +from advisor.db_log_parser import DatabaseLogs, DataSource +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import RulesSpec + +RuleToSuggestions = { + "stall-too-many-memtables": ["inc-bg-flush", "inc-write-buffer"], + "stall-too-many-L0": [ + "inc-max-subcompactions", + "inc-max-bg-compactions", + "inc-write-buffer-size", + "dec-max-bytes-for-level-base", + "inc-l0-slowdown-writes-trigger", + ], + "stop-too-many-L0": [ + "inc-max-bg-compactions", + "inc-write-buffer-size", + "inc-l0-stop-writes-trigger", + ], + "stall-too-many-compaction-bytes": [ + "inc-max-bg-compactions", + "inc-write-buffer-size", + "inc-hard-pending-compaction-bytes-limit", + "inc-soft-pending-compaction-bytes-limit", + ], + "level0-level1-ratio": ["l0-l1-ratio-health-check"], +} + + +class TestAllRulesTriggered(unittest.TestCase): + def setUp(self): + # load the Rules + this_path = os.path.abspath(os.path.dirname(__file__)) + ini_path = os.path.join(this_path, "input_files/triggered_rules.ini") + self.db_rules = RulesSpec(ini_path) + self.db_rules.load_rules_from_spec() + self.db_rules.perform_section_checks() + # load the data sources: LOG and OPTIONS + log_path = os.path.join(this_path, "input_files/LOG-0") + options_path = os.path.join(this_path, "input_files/OPTIONS-000005") + db_options_parser = DatabaseOptions(options_path) + self.column_families = db_options_parser.get_column_families() + db_logs_parser = DatabaseLogs(log_path, self.column_families) + self.data_sources = { + DataSource.Type.DB_OPTIONS: [db_options_parser], + DataSource.Type.LOG: [db_logs_parser], + } + + def test_triggered_conditions(self): + conditions_dict = self.db_rules.get_conditions_dict() + rules_dict = self.db_rules.get_rules_dict() + # Make sure none of the conditions is triggered beforehand + for cond in conditions_dict.values(): + self.assertFalse(cond.is_triggered(), repr(cond)) + for rule in rules_dict.values(): + self.assertFalse( + 
rule.is_triggered(conditions_dict, self.column_families), repr(rule) + ) + + # # Trigger the conditions as per the data sources. + # trigger_conditions(, conditions_dict) + + # Get the set of rules that have been triggered + triggered_rules = self.db_rules.get_triggered_rules( + self.data_sources, self.column_families + ) + + # Make sure each condition and rule is triggered + for cond in conditions_dict.values(): + if cond.get_data_source() is DataSource.Type.TIME_SERIES: + continue + self.assertTrue(cond.is_triggered(), repr(cond)) + + for rule in rules_dict.values(): + self.assertIn(rule, triggered_rules) + # Check the suggestions made by the triggered rules + for sugg in rule.get_suggestions(): + self.assertIn(sugg, RuleToSuggestions[rule.name]) + + for rule in triggered_rules: + self.assertIn(rule, rules_dict.values()) + for sugg in RuleToSuggestions[rule.name]: + self.assertIn(sugg, rule.get_suggestions()) + + +class TestConditionsConjunctions(unittest.TestCase): + def setUp(self): + # load the Rules + this_path = os.path.abspath(os.path.dirname(__file__)) + ini_path = os.path.join(this_path, "input_files/test_rules.ini") + self.db_rules = RulesSpec(ini_path) + self.db_rules.load_rules_from_spec() + self.db_rules.perform_section_checks() + # load the data sources: LOG and OPTIONS + log_path = os.path.join(this_path, "input_files/LOG-1") + options_path = os.path.join(this_path, "input_files/OPTIONS-000005") + db_options_parser = DatabaseOptions(options_path) + self.column_families = db_options_parser.get_column_families() + db_logs_parser = DatabaseLogs(log_path, self.column_families) + self.data_sources = { + DataSource.Type.DB_OPTIONS: [db_options_parser], + DataSource.Type.LOG: [db_logs_parser], + } + + def test_condition_conjunctions(self): + conditions_dict = self.db_rules.get_conditions_dict() + rules_dict = self.db_rules.get_rules_dict() + # Make sure none of the conditions is triggered beforehand + for cond in conditions_dict.values(): + 
self.assertFalse(cond.is_triggered(), repr(cond)) + for rule in rules_dict.values(): + self.assertFalse( + rule.is_triggered(conditions_dict, self.column_families), repr(rule) + ) + + # Trigger the conditions as per the data sources. + self.db_rules.trigger_conditions(self.data_sources) + + # Check for the conditions + conds_triggered = ["log-1-true", "log-2-true", "log-3-true"] + conds_not_triggered = ["log-4-false", "options-1-false"] + for cond in conds_triggered: + self.assertTrue(conditions_dict[cond].is_triggered(), repr(cond)) + for cond in conds_not_triggered: + self.assertFalse(conditions_dict[cond].is_triggered(), repr(cond)) + + # Check for the rules + rules_triggered = ["multiple-conds-true"] + rules_not_triggered = [ + "single-condition-false", + "multiple-conds-one-false", + "multiple-conds-all-false", + ] + for rule_name in rules_triggered: + rule = rules_dict[rule_name] + self.assertTrue( + rule.is_triggered(conditions_dict, self.column_families), repr(rule) + ) + for rule_name in rules_not_triggered: + rule = rules_dict[rule_name] + self.assertFalse( + rule.is_triggered(conditions_dict, self.column_families), repr(rule) + ) + + +class TestSanityChecker(unittest.TestCase): + def setUp(self): + this_path = os.path.abspath(os.path.dirname(__file__)) + ini_path = os.path.join(this_path, "input_files/rules_err1.ini") + db_rules = RulesSpec(ini_path) + db_rules.load_rules_from_spec() + self.rules_dict = db_rules.get_rules_dict() + self.conditions_dict = db_rules.get_conditions_dict() + self.suggestions_dict = db_rules.get_suggestions_dict() + + def test_rule_missing_suggestions(self): + regex = ".*rule must have at least one suggestion.*" + with self.assertRaisesRegex(ValueError, regex): + self.rules_dict["missing-suggestions"].perform_checks() + + def test_rule_missing_conditions(self): + regex = ".*rule must have at least one condition.*" + with self.assertRaisesRegex(ValueError, regex): + self.rules_dict["missing-conditions"].perform_checks() + + def 
test_condition_missing_regex(self): + regex = ".*provide regex for log condition.*" + with self.assertRaisesRegex(ValueError, regex): + self.conditions_dict["missing-regex"].perform_checks() + + def test_condition_missing_options(self): + regex = ".*options missing in condition.*" + with self.assertRaisesRegex(ValueError, regex): + self.conditions_dict["missing-options"].perform_checks() + + def test_condition_missing_expression(self): + regex = ".*expression missing in condition.*" + with self.assertRaisesRegex(ValueError, regex): + self.conditions_dict["missing-expression"].perform_checks() + + def test_suggestion_missing_option(self): + regex = ".*provide option or description.*" + with self.assertRaisesRegex(ValueError, regex): + self.suggestions_dict["missing-option"].perform_checks() + + def test_suggestion_missing_description(self): + regex = ".*provide option or description.*" + with self.assertRaisesRegex(ValueError, regex): + self.suggestions_dict["missing-description"].perform_checks() + + +class TestParsingErrors(unittest.TestCase): + def setUp(self): + self.this_path = os.path.abspath(os.path.dirname(__file__)) + + def test_condition_missing_source(self): + ini_path = os.path.join(self.this_path, "input_files/rules_err2.ini") + db_rules = RulesSpec(ini_path) + regex = ".*provide source for condition.*" + with self.assertRaisesRegex(NotImplementedError, regex): + db_rules.load_rules_from_spec() + + def test_suggestion_missing_action(self): + ini_path = os.path.join(self.this_path, "input_files/rules_err3.ini") + db_rules = RulesSpec(ini_path) + regex = ".*provide action for option.*" + with self.assertRaisesRegex(ValueError, regex): + db_rules.load_rules_from_spec() + + def test_section_no_name(self): + ini_path = os.path.join(self.this_path, "input_files/rules_err4.ini") + db_rules = RulesSpec(ini_path) + regex = "Parsing error: needed section header:.*" + with self.assertRaisesRegex(ValueError, regex): + db_rules.load_rules_from_spec() + + +if 
__name__ == "__main__": + unittest.main() diff --git a/src/rocksdb/tools/analyze_txn_stress_test.sh b/src/rocksdb/tools/analyze_txn_stress_test.sh new file mode 100755 index 000000000..477b1fac5 --- /dev/null +++ b/src/rocksdb/tools/analyze_txn_stress_test.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Usage: +# 1. Enable ROCKS_LOG_DETAILS in util/logging.h +# 2. Run ./transaction_test --gtest_filter="MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/*" --gtest_break_on_failure +# 3. SET=1 # 2 or 3 +# 4. LOG=/dev/shm/transaction_testdb_8600601584148590297/LOG +# 5. grep RandomTransactionVerify $LOG | cut -d' ' -f 12 | sort -n # to find verify snapshots +# 5. vn=1345 +# 6. vn_1=1340 +# 4. . tools/tools/analyze_txn_stress_test.sh +echo Input params: +# The rocksdb LOG path +echo $LOG +# Snapshot at which we got RandomTransactionVerify failure +echo $vn +# The snapshot before that where RandomTransactionVerify passed +echo $vn_1 +# The stress tests use 3 sets, one or more might have shown inconsistent results. 
+SET=${SET-1} # 1 or 2 or 3 +echo Checking set number $SET + +# Find the txns that committed between the two snapshots, and gather their changes made by them in /tmp/changes.txt +# 2019/02/28-15:25:51.655477 7fffec9ff700 [DEBUG] [ilities/transactions/write_prepared_txn_db.cc:416] Txn 68497 Committing with 68498 +grep Committing $LOG | awk '{if ($9 <= vn && $9 > vn_1) print $0}' vn=$vn vn_1=${vn_1} > /tmp/txn.txt +# 2019/02/28-15:25:49.046464 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:216] Commit of 65541 OK (txn12936193128775589751-9089) +for i in `cat /tmp/txn.txt | awk '{print $6}'`; do grep "Commit of $i " $LOG; done > /tmp/names.txt +for n in `cat /tmp/names.txt | awk '{print $9}'`; do grep $n $LOG; done > /tmp/changes.txt +echo "Sum of the changes:" +cat /tmp/changes.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}' + +# Gather read values at each snapshot +# 2019/02/28-15:25:51.655926 7fffebbff700 [DEBUG] [il/transaction_test_util.cc:347] VerifyRead at 67972 (67693): 000230 value: 15983 +grep "VerifyRead at ${vn_1} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/va.txt +grep "VerifyRead at ${vn} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/vb.txt + +# For each key in the 2nd snapshot, find the value read by 1st, do the adds, and see if the results match. +IFS=$'\n' +for l in `cat /tmp/vb.txt`; +do + grep $l /tmp/va.txt > /dev/null ; + if [[ $? 
-ne 0 ]]; then + #echo $l + k=`echo $l | awk '{print $1}'`; + v=`echo $l | awk '{print $3}'`; + # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867 + exp=`grep "\<$k\>" /tmp/changes.txt | tail -1 | cut -d= -f2`; + if [[ $v -ne $exp ]]; then echo $l; fi + else + k=`echo $l | awk '{print $1}'`; + grep "\<$k\>" /tmp/changes.txt + fi; +done + +# Check that all the keys read in the 1st snapshot are still visible in the 2nd +for l in `cat /tmp/va.txt`; +do + k=`echo $l | awk '{print $1}'`; + grep "\<$k\>" /tmp/vb.txt > /dev/null + if [[ $? -ne 0 ]]; then + echo missing key $k + fi +done + +# The following found a bug in ValidateSnapshot. It checks if the adds on each key match up. +grep Insert /tmp/changes.txt | cut -d' ' -f 10 | sort | uniq > /tmp/keys.txt +for k in `cat /tmp/keys.txt`; +do + grep "\<$k\>" /tmp/changes.txt > /tmp/adds.txt; + # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867 + START=`head -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d+ -f1` + END=`tail -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d= -f2` + ADDS=`cat /tmp/adds.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}'` + EXP=$((START+ADDS)) + # If first + all the adds != last then there was an issue with ValidateSnapshot. + if [[ $END -ne $EXP ]]; then echo inconsistent txn: $k $START+$ADDS=$END; cat /tmp/adds.txt; return 1; fi +done diff --git a/src/rocksdb/tools/auto_sanity_test.sh b/src/rocksdb/tools/auto_sanity_test.sh new file mode 100755 index 000000000..4670ef9bb --- /dev/null +++ b/src/rocksdb/tools/auto_sanity_test.sh @@ -0,0 +1,93 @@ +# shellcheck disable=SC2148 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+TMP_DIR="${TMPDIR:-/tmp}/rocksdb-sanity-test" + +if [ "$#" -lt 2 ]; then + echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]" + echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits." + recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'` + commit_new=`echo "$recent_commits" | head -n1` + commit_old=`echo "$recent_commits" | tail -n1` + echo "the most recent commits are:" + echo "$recent_commits" +else + commit_new=$1 + commit_old=$2 +fi + +if [ ! -d $TMP_DIR ]; then + mkdir $TMP_DIR +fi +dir_new="${TMP_DIR}/${commit_new}" +dir_old="${TMP_DIR}/${commit_old}" + +function makestuff() { + echo "make clean" + make clean > /dev/null + echo "make db_sanity_test -j32" + make db_sanity_test -j32 > /dev/null + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to perform 'make db_sanity_test'" + exit 1 + fi +} + +rm -r -f $dir_new +rm -r -f $dir_old + +echo "Running db sanity check with commits $commit_new and $commit_old." + +echo "=============================================================" +echo "Making build $commit_new" +git checkout $commit_new +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_new" + exit 1 +fi +makestuff +mv db_sanity_test new_db_sanity_test +echo "Creating db based on the new commit --- $commit_new" +./new_db_sanity_test $dir_new create +cp ./tools/db_sanity_test.cc $dir_new +cp ./tools/auto_sanity_test.sh $dir_new + +echo "=============================================================" +echo "Making build $commit_old" +git checkout $commit_old +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_old" + exit 1 +fi +cp -f $dir_new/db_sanity_test.cc ./tools/. +cp -f $dir_new/auto_sanity_test.sh ./tools/. 
+makestuff +mv db_sanity_test old_db_sanity_test +echo "Creating db based on the old commit --- $commit_old" +./old_db_sanity_test $dir_old create + +echo "=============================================================" +echo "[Backward Compatibility Check]" +echo "Verifying old db $dir_old using the new commit --- $commit_new" +./new_db_sanity_test $dir_old verify +if [ $? -ne 0 ]; then + echo "[ERROR] Backward Compatibility Check fails:" + echo " Verification of $dir_old using commit $commit_new failed." + exit 2 +fi + +echo "=============================================================" +echo "[Forward Compatibility Check]" +echo "Verifying new db $dir_new using the old commit --- $commit_old" +./old_db_sanity_test $dir_new verify +if [ $? -ne 0 ]; then + echo "[ERROR] Forward Compatibility Check fails:" + echo " $dir_new using commit $commit_old failed." + exit 2 +fi + +rm old_db_sanity_test +rm new_db_sanity_test +rm -rf $dir_new +rm -rf $dir_old + +echo "Auto sanity test passed!" diff --git a/src/rocksdb/tools/backup_db.sh b/src/rocksdb/tools/backup_db.sh new file mode 100755 index 000000000..aa82f1dba --- /dev/null +++ b/src/rocksdb/tools/backup_db.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# + +if [ "$#" -lt 2 ]; then + echo "usage: ${BASH_SOURCE[0]} <DB Path> <Backup Dir>" + exit 1 +fi + +db_dir="$1" +backup_dir="$2" + +echo "== Backing up DB $db_dir to $backup_dir" +./ldb backup --db="$db_dir" --backup_dir="$backup_dir" diff --git a/src/rocksdb/tools/benchmark.sh b/src/rocksdb/tools/benchmark.sh new file mode 100755 index 000000000..b41d25c78 --- /dev/null +++ b/src/rocksdb/tools/benchmark.sh @@ -0,0 +1,1173 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# REQUIRE: db_bench binary exists in the current directory + +# Exit Codes +EXIT_INVALID_ARGS=1 +EXIT_NOT_COMPACTION_TEST=2 +EXIT_UNKNOWN_JOB=3 + +# Size Constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) +T=$((1024 * G)) + +function display_usage() { + echo "usage: benchmark.sh [--help] <test>" + echo "" + echo "These are the available benchmark tests:" + echo -e "\tbulkload" + echo -e "\tfillseq_disable_wal\t\tSequentially fill the database with no WAL" + echo -e "\tfillseq_enable_wal\t\tSequentially fill the database with WAL" + echo -e "\toverwrite" + echo -e "\tupdaterandom" + echo -e "\treadrandom" + echo -e "\tmergerandom" + echo -e "\tfilluniquerandom" + echo -e "\tmultireadrandom" + echo -e "\tfwdrange" + echo -e "\trevrange" + echo -e "\treadwhilewriting" + echo -e "\treadwhilemerging" + echo -e "\tfwdrangewhilewriting" + echo -e "\trevrangewhilewriting" + echo -e "\tfwdrangewhilemerging" + echo -e "\trevrangewhilemerging" + echo -e "\trandomtransaction" + echo -e "\tuniversal_compaction" + echo -e "\tdebug" + echo "" + echo "Generic enviroment Variables:" + echo -e "\tJOB_ID\t\t\t\tAn identifier for the benchmark job, will appear in the results" + echo -e "\tDB_DIR\t\t\t\tPath to write the database data directory" + echo -e "\tWAL_DIR\t\t\t\tPath to write the database WAL directory" + echo -e "\tOUTPUT_DIR\t\t\tPath to write the benchmark results to (default: /tmp)" + echo -e "\tNUM_KEYS\t\t\tThe number of keys to use in the benchmark" + echo -e "\tKEY_SIZE\t\t\tThe size of the keys to use in the benchmark (default: 20 bytes)" + echo -e "\tVALUE_SIZE\t\t\tThe size of the values to use in the benchmark (default: 400 bytes)" + echo -e "\tBLOCK_SIZE\t\t\tThe size of the database blocks in the benchmark (default: 8 KB)" + echo -e "\tDB_BENCH_NO_SYNC\t\tDisable fsync on the WAL" + echo -e "\tNUMACTL\t\t\t\tWhen defined use numactl --interleave=all" + echo -e "\tNUM_THREADS\t\t\tThe number of threads to use (default: 64)" + echo -e 
"\tMB_WRITE_PER_SEC\t\t\tRate limit for background writer" + echo -e "\tNUM_NEXTS_PER_SEEK\t\t(default: 10)" + echo -e "\tCACHE_SIZE\t\t\tSize of the block cache (default: 16GB)" + echo -e "\tCACHE_NUMSHARDBITS\t\t\tNumber of shards for the block cache is 2 ** cache_numshardbits (default: 6)" + echo -e "\tCOMPRESSION_MAX_DICT_BYTES" + echo -e "\tCOMPRESSION_TYPE\t\tDefault compression(default: zstd)" + echo -e "\tBOTTOMMOST_COMPRESSION\t\t(default: none)" + echo -e "\tMIN_LEVEL_TO_COMPRESS\t\tValue for min_level_to_compress for Leveled" + echo -e "\tCOMPRESSION_SIZE_PERCENT\tValue for compression_size_percent for Universal" + echo -e "\tDURATION\t\t\tNumber of seconds for which the test runs" + echo -e "\tWRITES\t\t\t\tNumber of writes for which the test runs" + echo -e "\tWRITE_BUFFER_SIZE_MB\t\tThe size of the write buffer in MB (default: 128)" + echo -e "\tTARGET_FILE_SIZE_BASE_MB\tThe value for target_file_size_base in MB (default: 128)" + echo -e "\tMAX_BYTES_FOR_LEVEL_BASE_MB\tThe value for max_bytes_for_level_base in MB (default: 128)" + echo -e "\tMAX_BACKGROUND_JOBS\t\tThe value for max_background_jobs (default: 16)" + echo -e "\tCACHE_INDEX_AND_FILTER_BLOCKS\tThe value for cache_index_and_filter_blocks (default: 0)" + echo -e "\tUSE_O_DIRECT\t\t\tUse O_DIRECT for user reads and compaction" + echo -e "\tBYTES_PER_SYNC\t\t\tValue for bytes_per_sync, set to zero when USE_O_DIRECT is true" + echo -e "\tSTATS_INTERVAL_SECONDS\t\tValue for stats_interval_seconds" + echo -e "\tREPORT_INTERVAL_SECONDS\t\tValue for report_interval_seconds" + echo -e "\tSUBCOMPACTIONS\t\t\tValue for subcompactions" + echo -e "\tCOMPACTION_STYLE\t\tOne of leveled, universal, blob. Default is leveled." 
+ echo -e "\nEnvironment variables (mostly) for leveled compaction:" + echo -e "\tLEVEL0_FILE_NUM_COMPACTION_TRIGGER\t\tValue for level0_file_num_compaction_trigger" + echo -e "\tLEVEL0_SLOWDOWN_WRITES_TRIGGER\t\t\tValue for level0_slowdown_writes_trigger" + echo -e "\tLEVEL0_STOP_WRITES_TRIGGER\t\t\tValue for level0_stop_writes_trigger" + echo -e "\tPER_LEVEL_FANOUT\t\t\t\tValue for max_bytes_for_level_multiplier" + echo -e "\tSOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for soft_pending_compaction_bytes_limit in GB" + echo -e "\tHARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for hard_pending_compaction_bytes_limit in GB" + echo -e "\nEnvironment variables for universal compaction:" + echo -e "\tUNIVERSAL_MIN_MERGE_WIDTH\tValue of min_merge_width option for universal" + echo -e "\tUNIVERSAL_MAX_MERGE_WIDTH\tValue of min_merge_width option for universal" + echo -e "\tUNIVERSAL_SIZE_RATIO\t\tValue of size_ratio option for universal" + echo -e "\tUNIVERSAL_MAX_SIZE_AMP\t\tmax_size_amplification_percent for universal" + echo -e "\tUNIVERSAL_ALLOW_TRIVIAL_MOVE\tSet allow_trivial_move to true for universal, default is false" + echo -e "\nOptions for integrated BlobDB" + echo -e "\tMIN_BLOB_SIZE\tValue for min_blob_size" + echo -e "\tBLOB_FILE_SIZE\tValue for blob_file_size" + echo -e "\tBLOB_COMPRESSION_TYPE\tValue for blob_compression_type" + echo -e "\tBLOB_GC_AGE_CUTOFF\tValue for blob_garbage_collection_age_cutoff" + echo -e "\tBLOB_GC_FORCE_THRESHOLD\tValue for blob_garbage_collection_force_threshold" + echo -e "\tBLOB_FILE_STARTING_LEVEL\t\tBlob file starting level (default: 0)" + echo -e "\tUSE_BLOB_CACHE\t\t\tEnable blob cache (default: 1)" + echo -e "\tUSE_SHARED_BLOCK_AND_BLOB_CACHE\t\t\tUse the same backing cache for block cache and blob cache (default: 1)" + echo -e "\tBLOB_CACHE_SIZE\t\t\tSize of the blob cache (default: 16GB)" + echo -e "\tBLOB_CACHE_NUMSHARDBITS\t\t\tNumber of shards for the blob cache is 2 ** blob_cache_numshardbits 
(default: 6)" + echo -e "\tPREPOPULATE_BLOB_CACHE\t\t\tPre-populate hot/warm blobs in blob cache (default: 0)" +} + +if [ $# -lt 1 ]; then + display_usage + exit $EXIT_INVALID_ARGS +fi +bench_cmd=$1 +shift +bench_args=$* + +if [[ "$bench_cmd" == "--help" ]]; then + display_usage + exit +fi + +job_id=${JOB_ID} + +# Make it easier to run only the compaction test. Getting valid data requires +# a number of iterations and having an ability to run the test separately from +# rest of the benchmarks helps. +if [ "$COMPACTION_TEST" == "1" -a "$bench_cmd" != "universal_compaction" ]; then + echo "Skipping $1 because it's not a compaction test." + exit $EXIT_NOT_COMPACTION_TEST +fi + +if [ -z $DB_DIR ]; then + echo "DB_DIR is not defined" + exit $EXIT_INVALID_ARGS +fi + +if [ -z $WAL_DIR ]; then + echo "WAL_DIR is not defined" + exit $EXIT_INVALID_ARGS +fi + +output_dir=${OUTPUT_DIR:-/tmp} +if [ ! -d $output_dir ]; then + mkdir -p $output_dir +fi + +report="$output_dir/report.tsv" +schedule="$output_dir/schedule.txt" + +# all multithreaded tests run with sync=1 unless +# $DB_BENCH_NO_SYNC is defined +syncval="1" +if [ ! 
-z $DB_BENCH_NO_SYNC ]; then + echo "Turning sync off for all multithreaded tests" + syncval="0"; +fi + +compaction_style=${COMPACTION_STYLE:-leveled} +if [ $compaction_style = "leveled" ]; then + echo Use leveled compaction +elif [ $compaction_style = "universal" ]; then + echo Use universal compaction +elif [ $compaction_style = "blob" ]; then + echo Use blob compaction +else + echo COMPACTION_STYLE is :: $COMPACTION_STYLE :: and must be one of leveled, universal, blob + exit $EXIT_INVALID_ARGS +fi + +num_threads=${NUM_THREADS:-64} +mb_written_per_sec=${MB_WRITE_PER_SEC:-0} +# Only for tests that do range scans +num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} +cache_size=${CACHE_SIZE:-$(( 16 * $G ))} +cache_numshardbits=${CACHE_NUMSHARDBITS:-6} +compression_max_dict_bytes=${COMPRESSION_MAX_DICT_BYTES:-0} +compression_type=${COMPRESSION_TYPE:-zstd} +min_level_to_compress=${MIN_LEVEL_TO_COMPRESS:-"-1"} +compression_size_percent=${COMPRESSION_SIZE_PERCENT:-"-1"} + +duration=${DURATION:-0} +writes=${WRITES:-0} + +num_keys=${NUM_KEYS:-8000000000} +key_size=${KEY_SIZE:-20} +value_size=${VALUE_SIZE:-400} +block_size=${BLOCK_SIZE:-8192} +write_buffer_mb=${WRITE_BUFFER_SIZE_MB:-128} +target_file_mb=${TARGET_FILE_SIZE_BASE_MB:-128} +l1_mb=${MAX_BYTES_FOR_LEVEL_BASE_MB:-1024} +max_background_jobs=${MAX_BACKGROUND_JOBS:-16} +stats_interval_seconds=${STATS_INTERVAL_SECONDS:-60} +report_interval_seconds=${REPORT_INTERVAL_SECONDS:-1} +subcompactions=${SUBCOMPACTIONS:-1} +per_level_fanout=${PER_LEVEL_FANOUT:-8} + +cache_index_and_filter=${CACHE_INDEX_AND_FILTER_BLOCKS:-0} +if [[ $cache_index_and_filter -eq 0 ]]; then + cache_meta_flags="" +elif [[ $cache_index_and_filter -eq 1 ]]; then + cache_meta_flags="\ + --cache_index_and_filter_blocks=$cache_index_and_filter \ + --cache_high_pri_pool_ratio=0.5 --cache_low_pri_pool_ratio=0" +else + echo CACHE_INDEX_AND_FILTER_BLOCKS was $CACHE_INDEX_AND_FILTER_BLOCKS but must be 0 or 1 + exit $EXIT_INVALID_ARGS +fi + +soft_pending_arg="" +if [ 
! -z $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then + soft_pending_bytes=$( echo $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \ + awk '{ printf "%.0f", $1 * GB }' GB=$G ) + soft_pending_arg="--soft_pending_compaction_bytes_limit=$soft_pending_bytes" +fi + +hard_pending_arg="" +if [ ! -z $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then + hard_pending_bytes=$( echo $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \ + awk '{ printf "%.0f", $1 * GB }' GB=$G ) + hard_pending_arg="--hard_pending_compaction_bytes_limit=$hard_pending_bytes" +fi + +o_direct_flags="" +if [ ! -z $USE_O_DIRECT ]; then + # Some of these flags are only supported in new versions and --undefok makes that work + o_direct_flags="--use_direct_reads --use_direct_io_for_flush_and_compaction --prepopulate_block_cache=1" + bytes_per_sync=0 +else + bytes_per_sync=${BYTES_PER_SYNC:-$(( 1 * M ))} +fi + +univ_min_merge_width=${UNIVERSAL_MIN_MERGE_WIDTH:-2} +univ_max_merge_width=${UNIVERSAL_MAX_MERGE_WIDTH:-20} +univ_size_ratio=${UNIVERSAL_SIZE_RATIO:-1} +univ_max_size_amp=${UNIVERSAL_MAX_SIZE_AMP:-200} + +if [ ! 
-z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then + univ_allow_trivial_move=1 +else + univ_allow_trivial_move=0 +fi + +min_blob_size=${MIN_BLOB_SIZE:-0} +blob_file_size=${BLOB_FILE_SIZE:-$(( 256 * $M ))} +blob_compression_type=${BLOB_COMPRESSION_TYPE:-${compression_type}} +blob_gc_age_cutoff=${BLOB_GC_AGE_CUTOFF:-"0.25"} +blob_gc_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1} +blob_file_starting_level=${BLOB_FILE_STARTING_LEVEL:-0} +use_blob_cache=${USE_BLOB_CACHE:-1} +use_shared_block_and_blob_cache=${USE_SHARED_BLOCK_AND_BLOB_CACHE:-1} +blob_cache_size=${BLOB_CACHE_SIZE:-$(( 16 * $G ))} +blob_cache_numshardbits=${BLOB_CACHE_NUMSHARDBITS:-6} +prepopulate_blob_cache=${PREPOPULATE_BLOB_CACHE:-0} + +# This script still works back to RocksDB 6.0 +undef_params="\ +use_blob_cache,\ +use_shared_block_and_blob_cache,\ +blob_cache_size,blob_cache_numshardbits,\ +prepopulate_blob_cache,\ +multiread_batched,\ +cache_low_pri_pool_ratio,\ +prepopulate_block_cache" + +const_params_base=" + --undefok=$undef_params \ + --db=$DB_DIR \ + --wal_dir=$WAL_DIR \ + \ + --num=$num_keys \ + --key_size=$key_size \ + --value_size=$value_size \ + --block_size=$block_size \ + --cache_size=$cache_size \ + --cache_numshardbits=$cache_numshardbits \ + --compression_max_dict_bytes=$compression_max_dict_bytes \ + --compression_ratio=0.5 \ + --compression_type=$compression_type \ + --bytes_per_sync=$bytes_per_sync \ + $cache_meta_flags \ + $o_direct_flags \ + --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \ + \ + --write_buffer_size=$(( $write_buffer_mb * M)) \ + --target_file_size_base=$(( $target_file_mb * M)) \ + --max_bytes_for_level_base=$(( $l1_mb * M)) \ + \ + --verify_checksum=1 \ + --delete_obsolete_files_period_micros=$((60 * M)) \ + --max_bytes_for_level_multiplier=$per_level_fanout \ + \ + --statistics=0 \ + --stats_per_interval=1 \ + --stats_interval_seconds=$stats_interval_seconds \ + --report_interval_seconds=$report_interval_seconds \ + --histogram=1 \ + \ + 
--memtablerep=skip_list \ + --bloom_bits=10 \ + --open_files=-1 \ + --subcompactions=$subcompactions \ + \ + $bench_args" + +level_const_params=" + $const_params_base \ + --compaction_style=0 \ + --num_levels=8 \ + --min_level_to_compress=$min_level_to_compress \ + --level_compaction_dynamic_level_bytes=true \ + --pin_l0_filter_and_index_blocks_in_cache=1 \ + $soft_pending_arg \ + $hard_pending_arg \ +" + +# These inherit level_const_params because the non-blob LSM tree uses leveled compaction. +blob_const_params=" + $level_const_params \ + --enable_blob_files=true \ + --min_blob_size=$min_blob_size \ + --blob_file_size=$blob_file_size \ + --blob_compression_type=$blob_compression_type \ + --enable_blob_garbage_collection=true \ + --blob_garbage_collection_age_cutoff=$blob_gc_age_cutoff \ + --blob_garbage_collection_force_threshold=$blob_gc_force_threshold \ + --blob_file_starting_level=$blob_file_starting_level \ + --use_blob_cache=$use_blob_cache \ + --use_shared_block_and_blob_cache=$use_shared_block_and_blob_cache \ + --blob_cache_size=$blob_cache_size \ + --blob_cache_numshardbits=$blob_cache_numshardbits \ + --prepopulate_blob_cache=$prepopulate_blob_cache \ +" + +# TODO: +# pin_l0_filter_and..., is this OK? 
+univ_const_params=" + $const_params_base \ + --compaction_style=1 \ + --num_levels=40 \ + --universal_compression_size_percent=$compression_size_percent \ + --pin_l0_filter_and_index_blocks_in_cache=1 \ + --universal_min_merge_width=$univ_min_merge_width \ + --universal_max_merge_width=$univ_max_merge_width \ + --universal_size_ratio=$univ_size_ratio \ + --universal_max_size_amplification_percent=$univ_max_size_amp \ + --universal_allow_trivial_move=$univ_allow_trivial_move \ +" + +if [ $compaction_style == "leveled" ]; then + const_params="$level_const_params" + l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4} + l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20} + l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30} +elif [ $compaction_style == "universal" ]; then + const_params="$univ_const_params" + l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-8} + l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20} + l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30} +else + # compaction_style == "blob" + const_params="$blob_const_params" + l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4} + l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20} + l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30} +fi + +l0_config=" + --level0_file_num_compaction_trigger=$l0_file_num_compaction_trigger \ + --level0_slowdown_writes_trigger=$l0_slowdown_writes_trigger \ + --level0_stop_writes_trigger=$l0_stop_writes_trigger" + +# You probably don't want to set both --writes and --duration +if [ $duration -gt 0 ]; then + const_params="$const_params --duration=$duration" +fi +if [ $writes -gt 0 ]; then + const_params="$const_params --writes=$writes" +fi + +params_w="$l0_config \ + --max_background_jobs=$max_background_jobs \ + --max_write_buffer_number=8 \ + $const_params" + +params_bulkload="--max_background_jobs=$max_background_jobs \ + --max_write_buffer_number=8 \ + 
--allow_concurrent_memtable_write=false \ + --level0_file_num_compaction_trigger=$((10 * M)) \ + --level0_slowdown_writes_trigger=$((10 * M)) \ + --level0_stop_writes_trigger=$((10 * M)) \ + $const_params " + +params_fillseq="--allow_concurrent_memtable_write=false \ + $params_w " + +# +# Tune values for level and universal compaction. +# For universal compaction, these level0_* options mean total sorted of runs in +# LSM. In level-based compaction, it means number of L0 files. +# +params_level_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=4 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + +params_univ_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=8 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + +tsv_header="ops_sec\tmb_sec\tlsm_sz\tblob_sz\tc_wgb\tw_amp\tc_mbps\tc_wsecs\tc_csecs\tb_rgb\tb_wgb\tusec_op\tp50\tp99\tp99.9\tp99.99\tpmax\tuptime\tstall%\tNstall\tu_cpu\ts_cpu\trss\ttest\tdate\tversion\tjob_id\tgithash" + +function get_cmd() { + output=$1 + + numa="" + if [ ! -z $NUMACTL ]; then + numa="numactl --interleave=all " + fi + + # Try to use timeout when duration is set because some tests (revrange*) hang + # for some versions (v6.10, v6.11). 
+ timeout_cmd="" + if [ $duration -gt 0 ]; then + if hash timeout ; then + timeout_cmd="timeout $(( $duration + 600 ))" + fi + fi + + echo "/usr/bin/time -f '%e %U %S' -o $output $numa $timeout_cmd" +} + +function month_to_num() { + local date_str=$1 + date_str="${date_str/Jan/01}" + date_str="${date_str/Feb/02}" + date_str="${date_str/Mar/03}" + date_str="${date_str/Apr/04}" + date_str="${date_str/May/05}" + date_str="${date_str/Jun/06}" + date_str="${date_str/Jul/07}" + date_str="${date_str/Aug/08}" + date_str="${date_str/Sep/09}" + date_str="${date_str/Oct/10}" + date_str="${date_str/Nov/11}" + date_str="${date_str/Dec/12}" + echo $date_str +} + +function start_stats { + output=$1 + iostat -y -mx 1 >& $output.io & + vmstat 1 >& $output.vm & + # tail -1 because "ps | grep db_bench" returns 2 entries and we want the second + while :; do ps aux | grep db_bench | grep -v grep | tail -1; sleep 10; done >& $output.ps & + # This sets a global value + pspid=$! + + while :; do + b_gb=$( ls -l $DB_DIR 2> /dev/null | grep blob | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) + s_gb=$( ls -l $DB_DIR 2> /dev/null | grep sst | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) + l_gb=$( ls -l $WAL_DIR 2> /dev/null | grep log | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) + a_gb=$( ls -l $DB_DIR 2> /dev/null | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' ) + ts=$( date +%H%M%S ) + echo -e "${a_gb}\t${s_gb}\t${l_gb}\t${b_gb}\t${ts}" + sleep 10 + done >& $output.sizes & + # This sets a global value + szpid=$! 
}

# Stop the samplers started by start_stats (reads globals $pspid/$szpid),
# compress the iostat/vmstat logs and append a one-line summary with the
# maximum observed size of each directory-size column.
function stop_stats {
  output=$1
  kill $pspid
  kill $szpid
  killall iostat
  killall vmstat
  sleep 1
  gzip $output.io
  gzip $output.vm

  # Columns in $output.sizes are: all, sst, log, blob (GB); max of each column
  am=$( sort -nk 1,1 $output.sizes | tail -1 | awk '{ print $1 }' )
  sm=$( sort -nk 2,2 $output.sizes | tail -1 | awk '{ print $2 }' )
  lm=$( sort -nk 3,3 $output.sizes | tail -1 | awk '{ print $3 }' )
  bm=$( sort -nk 4,4 $output.sizes | tail -1 | awk '{ print $4 }' )
  echo -e "max sizes (GB): $am all, $sm sst, $lm log, $bm blob" >> $output.sizes
}

# Print $1 (a number) converted from unit $2 (MB|GB|TB) into GB with one
# decimal place; prints "NA" for any unrecognized unit.
function units_as_gb {
  size=$1
  units=$2

  case $units in
    MB)
      echo "$size" | awk '{ printf "%.1f", $1 / 1024.0 }'
      ;;
    GB)
      echo "$size"
      ;;
    TB)
      echo "$size" | awk '{ printf "%.1f", $1 * 1024.0 }'
      ;;
    *)
      echo "NA"
      ;;
  esac
}

# Scrape one benchmark's stdout log ($1) plus the DB LOG and append a single
# TSV row (named $2, grepping db_bench output line "$3") to $report.
function summarize_result {
  test_out=$1
  test_name=$2
  bench_name=$3

  # In recent versions these can be found directly via db_bench --version, --build_info but
  # grepping from the log lets this work on older versions.
  version="$( grep "RocksDB version:" "$DB_DIR"/LOG | head -1 | awk '{ printf "%s", $5 }' )"
  git_hash="$( grep "Git sha" "$DB_DIR"/LOG | head -1 | awk '{ printf "%s", substr($5, 1, 10) }' )"

  # Note that this function assumes that the benchmark executes long enough so
  # that "Compaction Stats" is written to stdout at least once. If it won't
  # happen then empty output from grep when searching for "Sum" will cause
  # syntax errors.
  date=$( grep ^Date: $test_out | awk '{ print $6 "-" $3 "-" $4 "T" $5 }' )
  my_date=$( month_to_num $date )
  uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
  stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' )
  nstall=$( grep ^Stalls\(count\): $test_out | tail -1 | awk '{ print $2 + $6 + $10 + $14 + $18 + $20 }' )

  if !
grep ^"$bench_name" "$test_out" > /dev/null 2>&1 ; then + echo -e "failed\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t$test_name\t$my_date\t$version\t$job_id\t$git_hash" + return + fi + + # Output formats + # V1: readrandom : 10.218 micros/op 3131616 ops/sec; 1254.3 MB/s (176144999 of 176144999 found) + # The MB/s is mssing for multireadrandom + # V1a: multireadrandom : 10.164 micros/op 3148272 ops/sec; (177099990 of 177099990 found) + # V1: overwrite : 7.939 micros/op 125963 ops/sec; 50.5 MB/s + # V2: overwrite : 7.854 micros/op 127320 ops/sec 1800.001 seconds 229176999 operations; 51.0 MB/s + + format_version=$( grep ^"$bench_name" "$test_out" \ + | awk '{ if (NF >= 10 && $8 == "seconds") { print "V2" } else { print "V1" } }' ) + if [ $format_version == "V1" ]; then + ops_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $5 }' ) + usecs_op=$( grep ^"$bench_name" "$test_out" | awk '{ printf "%.1f", $3 }' ) + if [ "$bench_name" == "multireadrandom" ]; then + mb_sec="NA" + else + mb_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $7 }' ) + fi + else + ops_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $5 }' ) + usecs_op=$( grep ^"$bench_name" "$test_out" | awk '{ printf "%.1f", $3 }' ) + mb_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $11 }' ) + fi + + # For RocksDB version 4.x there are fewer fields but this still parses correctly + # Cumulative writes: 242M writes, 242M keys, 18M commit groups, 12.9 writes per commit group, ingest: 95.96 GB, 54.69 MB/s + cum_writes_gb_orig=$( grep "^Cumulative writes" "$test_out" | tail -1 | awk '{ for (x=1; x<=NF; x++) { if ($x == "ingest:") { printf "%.1f", $(x+1) } } }' ) + cum_writes_units=$( grep "^Cumulative writes" "$test_out" | tail -1 | awk '{ for (x=1; x<=NF; x++) { if ($x == "ingest:") { print $(x+2) } } }' | sed 's/,//g' ) + cum_writes_gb=$( units_as_gb "$cum_writes_gb_orig" "$cum_writes_units" ) + + # Cumulative compaction: 1159.74 GB write, 661.03 MB/s write, 1108.89 GB read, 632.04 MB/s 
read, 6284.3 seconds + cmb_ps=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ printf "%.1f", $6 }' ) + sum_wgb_orig=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ printf "%.1f", $3 }' ) + sum_wgb_units=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ print $4 }' ) + sum_wgb=$( units_as_gb "$sum_wgb_orig" "$sum_wgb_units" ) + + # Flush(GB): cumulative 97.193, interval 1.247 + flush_wgb=$( grep "^Flush(GB)" "$test_out" | tail -1 | awk '{ print $3 }' | tr ',' ' ' | awk '{ print $1 }' ) + + if [[ "$sum_wgb" == "NA" || \ + "$cum_writes_gb" == "NA" || \ + "$cum_writes_gb_orig" == "0.0" || \ + -z "$cum_writes_gb_orig" || \ + -z "$flush_wgb" ]]; then + wamp="NA" + else + wamp=$( echo "( $sum_wgb + $flush_wgb ) / $cum_writes_gb" | bc -l | awk '{ printf "%.1f", $1 }' ) + fi + + c_wsecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $15 }' ) + c_csecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $16 }' ) + + lsm_size=$( grep "^ Sum" "$test_out" | tail -1 | awk '{ printf "%.0f%s", $3, $4 }' ) + blob_size=$( grep "^Blob file count:" "$test_out" | tail -1 | awk '{ printf "%.0f%s", $7, $8 }' ) + # Remove the trailing comma from blob_size: 3.0GB, -> 3.0GB + blob_size="${blob_size/,/}" + + b_rgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $21 }' ) + b_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $22 }' ) + + p50=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' ) + p99=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $7 }' ) + p999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $9 }' ) + p9999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $11 }' ) + pmax=$( grep "^Min: " $test_out | grep Median: | grep Max: | awk '{ printf "%.0f", $6 }' ) + + # Use the last line because there might be extra lines when the db_bench process exits with an error + time_out="$test_out".time + u_cpu=$( tail -1 
"$time_out" | awk '{ printf "%.1f", $2 / 1000.0 }' ) + s_cpu=$( tail -1 "$time_out" | awk '{ printf "%.1f", $3 / 1000.0 }' ) + + rss="NA" + if [ -f $test_out.stats.ps ]; then + rss=$( awk '{ printf "%.1f\n", $6 / (1024 * 1024) }' "$test_out".stats.ps | sort -n | tail -1 ) + fi + + # if the report TSV (Tab Separate Values) file does not yet exist, create it and write the header row to it + if [ ! -f "$report" ]; then + echo -e "# ops_sec - operations per second" >> "$report" + echo -e "# mb_sec - ops_sec * size-of-operation-in-MB" >> "$report" + echo -e "# lsm_sz - size of LSM tree" >> "$report" + echo -e "# blob_sz - size of BlobDB logs" >> "$report" + echo -e "# c_wgb - GB written by compaction" >> "$report" + echo -e "# w_amp - Write-amplification as (bytes written by compaction / bytes written by memtable flush)" >> "$report" + echo -e "# c_mbps - Average write rate for compaction" >> "$report" + echo -e "# c_wsecs - Wall clock seconds doing compaction" >> "$report" + echo -e "# c_csecs - CPU seconds doing compaction" >> "$report" + echo -e "# b_rgb - Blob compaction read GB" >> "$report" + echo -e "# b_wgb - Blob compaction write GB" >> "$report" + echo -e "# usec_op - Microseconds per operation" >> "$report" + echo -e "# p50, p99, p99.9, p99.99 - 50th, 99th, 99.9th, 99.99th percentile response time in usecs" >> "$report" + echo -e "# pmax - max response time in usecs" >> "$report" + echo -e "# uptime - RocksDB uptime in seconds" >> "$report" + echo -e "# stall% - Percentage of time writes are stalled" >> "$report" + echo -e "# Nstall - Number of stalls" >> "$report" + echo -e "# u_cpu - #seconds/1000 of user CPU" >> "$report" + echo -e "# s_cpu - #seconds/1000 of system CPU" >> "$report" + echo -e "# rss - max RSS in GB for db_bench process" >> "$report" + echo -e "# test - Name of test" >> "$report" + echo -e "# date - Date/time of test" >> "$report" + echo -e "# version - RocksDB version" >> "$report" + echo -e "# job_id - User-provided job ID" >> "$report" 
+ echo -e "# githash - git hash at which db_bench was compiled" >> "$report" + echo -e $tsv_header >> "$report" + fi + + echo -e "$ops_sec\t$mb_sec\t$lsm_size\t$blob_size\t$sum_wgb\t$wamp\t$cmb_ps\t$c_wsecs\t$c_csecs\t$b_rgb\t$b_wgb\t$usecs_op\t$p50\t$p99\t$p999\t$p9999\t$pmax\t$uptime\t$stall_pct\t$nstall\t$u_cpu\t$s_cpu\t$rss\t$test_name\t$my_date\t$version\t$job_id\t$git_hash" \ + >> "$report" +} + +function run_bulkload { + # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the + # client can discover where to restart a load after a crash. I think this is a good way to load. + echo "Bulk loading $num_keys random keys" + log_file_name=$output_dir/benchmark_bulkload_fillrandom.log + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=fillrandom,stats \ + --use_existing_db=0 \ + --disable_auto_compactions=1 \ + --sync=0 \ + $params_bulkload \ + --threads=1 \ + --memtablerep=vector \ + --allow_concurrent_memtable_write=false \ + --disable_wal=1 \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + eval $cmd + summarize_result $log_file_name bulkload fillrandom + + echo "Compacting..." + log_file_name=$output_dir/benchmark_bulkload_compact.log + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=compact,stats \ + --use_existing_db=1 \ + --disable_auto_compactions=1 \ + --sync=0 \ + $params_w \ + --threads=1 \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + eval $cmd +} + +# +# Parameter description: +# +# $1 - 1 if I/O statistics should be collected. +# $2 - compaction type to use (level=0, universal=1). 
+# $3 - number of subcompactions. +# $4 - number of maximum background compactions. +# +function run_manual_compaction_worker { + # This runs with a vector memtable and the WAL disabled to load faster. + # It is still crash safe and the client can discover where to restart a + # load after a crash. I think this is a good way to load. + echo "Bulk loading $num_keys random keys for manual compaction." + + log_file_name=$output_dir/benchmark_man_compact_fillrandom_$3.log + + if [ "$2" == "1" ]; then + extra_params=$params_univ_compact + else + extra_params=$params_level_compact + fi + + # Make sure that fillrandom uses the same compaction options as compact. + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=fillrandom,stats \ + --use_existing_db=0 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --memtablerep=vector \ + --allow_concurrent_memtable_write=false \ + --disable_wal=1 \ + --max_background_compactions=$4 \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $log_file_name" + + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + eval $cmd + + summarize_result $log_file_namefillrandom_output_file man_compact_fillrandom_$3 fillrandom + + echo "Compacting with $3 subcompactions specified ..." + + log_file_name=$output_dir/benchmark_man_compact_$3.log + + # This is the part we're really interested in. Given that compact benchmark + # doesn't output regular statistics then we'll just use the time command to + # measure how long this step takes. 
+ cmd="{ \ + time ./db_bench --benchmarks=compact,stats \ + --use_existing_db=1 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --max_background_compactions=$4 \ + ;} + 2>&1 | tee -a $log_file_name" + + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + eval $cmd + + # Can't use summarize_result here. One way to analyze the results is to run + # "grep real" on the resulting log files. +} + +function run_univ_compaction { + # Always ask for I/O statistics to be measured. + io_stats=1 + + # Values: kCompactionStyleLevel = 0x0, kCompactionStyleUniversal = 0x1. + compaction_style=1 + + # Define a set of benchmarks. + subcompactions=(1 2 4 8 16) + max_background_compactions=(16 16 8 4 2) + + i=0 + total=${#subcompactions[@]} + + # Execute a set of benchmarks to cover variety of scenarios. + while [ "$i" -lt "$total" ] + do + run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \ + ${max_background_compactions[$i]} + ((i++)) + done +} + +function run_fillseq { + # This runs with a vector memtable. WAL can be either disabled or enabled + # depending on the input parameter (1 for disabled, 0 for enabled). The main + # benefit behind disabling WAL is to make loading faster. It is still crash + # safe and the client can discover where to restart a load after a crash. I + # think this is a good way to load. + + # Make sure that we'll have unique names for all the files so that data won't + # be overwritten. 
  # $1 == 1 disables the WAL; encode that choice (and the value size) in the
  # log and test names so that repeated runs don't overwrite each other.
  if [ $1 == 1 ]; then
    log_file_name="${output_dir}/benchmark_fillseq.wal_disabled.v${value_size}.log"
    test_name=fillseq.wal_disabled.v${value_size}
  else
    log_file_name="${output_dir}/benchmark_fillseq.wal_enabled.v${value_size}.log"
    test_name=fillseq.wal_enabled.v${value_size}
  fi

  # For Leveled compaction hardwire this to 0 so that data that is trivial-moved
  # to larger levels (3, 4, etc) will be compressed.
  if [ $compaction_style == "leveled" ]; then
    comp_arg="--min_level_to_compress=0"
  elif [ $compaction_style == "universal" ]; then
    if [ ! -z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then
      # See GetCompressionFlush where compression_size_percent < 1 means use the default
      # compression which is needed because trivial moves are enabled
      comp_arg="--universal_compression_size_percent=-1"
    else
      # See GetCompressionFlush where compression_size_percent > 0 means no compression.
      # Don't set anything here because compression_size_percent is set in univ_const_params
      comp_arg=""
    fi
  else
    # compaction_style == "blob"
    comp_arg="--min_level_to_compress=0"
  fi

  echo "Loading $num_keys keys sequentially"
  time_cmd=$( get_cmd $log_file_name.time )
  # Vector memtable + single thread: fastest sequential load path
  cmd="$time_cmd ./db_bench --benchmarks=fillseq,stats \
       $params_fillseq \
       $comp_arg \
       --use_existing_db=0 \
       --sync=0 \
       --threads=1 \
       --memtablerep=vector \
       --allow_concurrent_memtable_write=false \
       --disable_wal=$1 \
       --seed=$( date +%s ) \
       --report_file=${log_file_name}.r.csv \
       2>&1 | tee -a $log_file_name"
  if [[ "$job_id" != "" ]]; then
    echo "Job ID: ${job_id}" > $log_file_name
    echo $cmd | tee -a $log_file_name
  else
    echo $cmd | tee $log_file_name
  fi
  start_stats $log_file_name.stats
  eval $cmd
  stop_stats $log_file_name.stats

  # The constant "fillseq" which we pass to db_bench is the benchmark name.
+ summarize_result $log_file_name $test_name fillseq +} + +function run_lsm { + # This flushes the memtable and L0 to get the LSM tree into a deterministic + # state for read-only tests that will follow. + echo "Flush memtable, wait, compact L0, wait" + job=$1 + + if [ $job = flush_mt_l0 ]; then + benchmarks=levelstats,flush,waitforcompaction,compact0,waitforcompaction,memstats,levelstats + elif [ $job = waitforcompaction ]; then + benchmarks=levelstats,waitforcompaction,memstats,levelstats + else + echo Job unknown: $job + exit $EXIT_NOT_COMPACTION_TEST + fi + + log_file_name=$output_dir/benchmark_${job}.log + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=$benchmarks,stats \ + --use_existing_db=1 \ + --sync=0 \ + $params_w \ + --threads=1 \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + # waitforcompaction can hang with universal (compaction_style=1) + # see bug https://github.com/facebook/rocksdb/issues/9275 + eval $cmd + stop_stats $log_file_name.stats + # Don't summarize, the log doesn't have the output needed for it +} + +function run_change { + output_name=$1 + grep_name=$2 + benchmarks=$3 + echo "Do $num_keys random $output_name" + log_file_name="$output_dir/benchmark_${output_name}.t${num_threads}.s${syncval}.log" + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=$benchmarks,stats \ + --use_existing_db=1 \ + --sync=$syncval \ + $params_w \ + --threads=$num_threads \ + --merge_operator=\"put\" \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + 
eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name ${output_name}.t${num_threads}.s${syncval} $grep_name +} + +function run_filluniquerandom { + echo "Loading $num_keys unique keys randomly" + log_file_name=$output_dir/benchmark_filluniquerandom.log + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=filluniquerandom,stats \ + --use_existing_db=0 \ + --sync=0 \ + $params_w \ + --threads=1 \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name filluniquerandom filluniquerandom +} + +function run_readrandom { + echo "Reading $num_keys random keys" + log_file_name="${output_dir}/benchmark_readrandom.t${num_threads}.log" + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=readrandom,stats \ + --use_existing_db=1 \ + $params_w \ + --threads=$num_threads \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name readrandom.t${num_threads} readrandom +} + +function run_multireadrandom { + echo "Multi-Reading $num_keys random keys" + log_file_name="${output_dir}/benchmark_multireadrandom.t${num_threads}.log" + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=multireadrandom,stats \ + --use_existing_db=1 \ + --threads=$num_threads \ + --batch_size=10 \ + $params_w \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv 
\ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name multireadrandom.t${num_threads} multireadrandom +} + +function run_readwhile { + operation=$1 + echo "Reading $num_keys random keys while $operation" + log_file_name="${output_dir}/benchmark_readwhile${operation}.t${num_threads}.log" + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench --benchmarks=readwhile${operation},stats \ + --use_existing_db=1 \ + --sync=$syncval \ + $params_w \ + --threads=$num_threads \ + --merge_operator=\"put\" \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name readwhile${operation}.t${num_threads} readwhile${operation} +} + +function run_rangewhile { + operation=$1 + full_name=$2 + reverse_arg=$3 + log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log" + time_cmd=$( get_cmd $log_file_name.time ) + echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}" + cmd="$time_cmd ./db_bench --benchmarks=seekrandomwhile${operation},stats \ + --use_existing_db=1 \ + --sync=$syncval \ + $params_w \ + --threads=$num_threads \ + --merge_operator=\"put\" \ + --seek_nexts=$num_nexts_per_seek \ + --reverse_iterator=$reverse_arg \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + echo $cmd | tee $log_file_name + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result 
$log_file_name ${full_name}.t${num_threads} seekrandomwhile${operation} +} + +function run_range { + full_name=$1 + reverse_arg=$2 + log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log" + time_cmd=$( get_cmd $log_file_name.time ) + echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}" + cmd="$time_cmd ./db_bench --benchmarks=seekrandom,stats \ + --use_existing_db=1 \ + $params_w \ + --threads=$num_threads \ + --seek_nexts=$num_nexts_per_seek \ + --reverse_iterator=$reverse_arg \ + --seed=$( date +%s ) \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee -a $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats + summarize_result $log_file_name ${full_name}.t${num_threads} seekrandom +} + +function run_randomtransaction { + echo "..." + log_file_name=$output_dir/benchmark_randomtransaction.log + time_cmd=$( get_cmd $log_file_name.time ) + cmd="$time_cmd ./db_bench $params_w --benchmarks=randomtransaction,stats \ + --num=$num_keys \ + --transaction_db \ + --threads=5 \ + --transaction_sets=5 \ + --report_file=${log_file_name}.r.csv \ + 2>&1 | tee $log_file_name" + if [[ "$job_id" != "" ]]; then + echo "Job ID: ${job_id}" > $log_file_name + echo $cmd | tee -a $log_file_name + else + echo $cmd | tee $log_file_name + fi + start_stats $log_file_name.stats + eval $cmd + stop_stats $log_file_name.stats +} + +function now() { + echo `date +"%s"` +} + + +echo "===== Benchmark =====" + +# Run!!! 
+IFS=',' read -a jobs <<< $bench_cmd +# shellcheck disable=SC2068 +for job in ${jobs[@]}; do + + if [ $job != debug ]; then + echo "Starting $job (ID: $job_id) at `date`" | tee -a $schedule + fi + + start=$(now) + if [ $job = bulkload ]; then + run_bulkload + elif [ $job = flush_mt_l0 ]; then + run_lsm flush_mt_l0 + elif [ $job = waitforcompaction ]; then + run_lsm waitforcompaction + elif [ $job = fillseq_disable_wal ]; then + run_fillseq 1 + elif [ $job = fillseq_enable_wal ]; then + run_fillseq 0 + elif [ $job = overwrite ]; then + run_change overwrite overwrite overwrite + elif [ $job = overwritesome ]; then + # This uses a different name for overwrite results so it can be run twice in one benchmark run. + run_change overwritesome overwrite overwrite + elif [ $job = overwriteandwait ]; then + run_change overwriteandwait overwrite overwrite,waitforcompaction + elif [ $job = updaterandom ]; then + run_change updaterandom updaterandom updaterandom + elif [ $job = mergerandom ]; then + run_change mergerandom mergerandom mergerandom + elif [ $job = filluniquerandom ]; then + run_filluniquerandom + elif [ $job = readrandom ]; then + run_readrandom + elif [ $job = multireadrandom ]; then + run_multireadrandom + elif [ $job = fwdrange ]; then + run_range $job false + elif [ $job = revrange ]; then + run_range $job true + elif [ $job = readwhilewriting ]; then + run_readwhile writing + elif [ $job = readwhilemerging ]; then + run_readwhile merging + elif [ $job = fwdrangewhilewriting ]; then + run_rangewhile writing $job false + elif [ $job = revrangewhilewriting ]; then + run_rangewhile writing $job true + elif [ $job = fwdrangewhilemerging ]; then + run_rangewhile merging $job false + elif [ $job = revrangewhilemerging ]; then + run_rangewhile merging $job true + elif [ $job = randomtransaction ]; then + run_randomtransaction + elif [ $job = universal_compaction ]; then + run_univ_compaction + elif [ $job = debug ]; then + num_keys=1000; # debug + echo "Setting 
num_keys to $num_keys" + else + echo "unknown job $job" + exit $EXIT_UNKNOWN_JOB + fi + end=$(now) + + if [ $job != debug ]; then + echo "Completed $job (ID: $job_id) in $((end-start)) seconds" | tee -a $schedule + fi + + echo -e $tsv_header + tail -1 $report + +done diff --git a/src/rocksdb/tools/benchmark_ci.py b/src/rocksdb/tools/benchmark_ci.py new file mode 100755 index 000000000..de9f69cf9 --- /dev/null +++ b/src/rocksdb/tools/benchmark_ci.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +"""Run benchmark_compare.sh on the most recent build, for CI +""" + +import argparse +import glob +import logging +import os +import re +import shutil +import subprocess +import sys + +logging.basicConfig(level=logging.INFO) + + +class Config: + def __init__(self, args): + self.version_file = "./include/rocksdb/version.h" + self.data_dir = os.path.expanduser(f"{args.db_dir}") + self.results_dir = os.path.expanduser(f"{args.output_dir}") + self.benchmark_script = f"{os.getcwd()}/tools/benchmark_compare.sh" + self.benchmark_cwd = f"{os.getcwd()}/tools" + + benchmark_env_keys = [ + "LD_LIBRARY_PATH", + "NUM_KEYS", + "KEY_SIZE", + "VALUE_SIZE", + "CACHE_SIZE_MB", + "DURATION_RW", + "DURATION_RO", + "MB_WRITE_PER_SEC", + "NUM_THREADS", + "COMPRESSION_TYPE", + "MIN_LEVEL_TO_COMPRESS", + "WRITE_BUFFER_SIZE_MB", + "TARGET_FILE_SIZE_BASE_MB", + "MAX_BYTES_FOR_LEVEL_BASE_MB", + "MAX_BACKGROUND_JOBS", + "CACHE_INDEX_AND_FILTER_BLOCKS", + "USE_O_DIRECT", + "STATS_INTERVAL_SECONDS", + "SUBCOMPACTIONS", + "COMPACTION_STYLE", + "CI_TESTS_ONLY", + ] + + +def read_version(config): + majorRegex = re.compile(r"#define ROCKSDB_MAJOR\s([0-9]+)") + minorRegex = re.compile(r"#define ROCKSDB_MINOR\s([0-9]+)") + patchRegex = re.compile(r"#define 
ROCKSDB_PATCH\s([0-9]+)") + with open(config.version_file, "r") as reader: + major = None + minor = None + patch = None + for line in reader: + if major is None: + major = majorRegex.match(line) + elif minor is None: + minor = minorRegex.match(line) + elif patch is None: + patch = patchRegex.match(line) + + if patch is not None: + break + + if patch is not None: + return (major.group(1), minor.group(1), patch.group(1)) + + # Didn't complete a match + return None + + +def prepare(version_str, config): + old_files = glob.glob(f"{config.results_dir}/{version_str}/**", recursive=True) + for f in old_files: + if os.path.isfile(f): + logging.debug(f"remove file {f}") + os.remove(f) + for f in old_files: + if os.path.isdir(f): + logging.debug(f"remove dir {f}") + os.rmdir(f) + + db_bench_vers = f"{config.benchmark_cwd}/db_bench.{version_str}" + + # Create a symlink to the db_bench executable + os.symlink(f"{os.getcwd()}/db_bench", db_bench_vers) + + +def results(version_str, config): + # Copy the report TSV file back to the top level of results + shutil.copyfile( + f"{config.results_dir}/{version_str}/report.tsv", + f"{config.results_dir}/report.tsv", + ) + + +def cleanup(version_str, config): + # Remove the symlink to the db_bench executable + db_bench_vers = f"{config.benchmark_cwd}/db_bench.{version_str}" + os.remove(db_bench_vers) + + +def get_benchmark_env(): + env = [] + for key in Config.benchmark_env_keys: + value = os.getenv(key) + if value is not None: + env.append((key, value)) + return env + + +def main(): + """Tool for running benchmark_compare.sh on the most recent build, for CI + This tool will + + (1) Work out the current version of RocksDB + (2) Run benchmark_compare with that version alone + """ + + parser = argparse.ArgumentParser( + description="benchmark_compare.sh Python wrapper for CI." 
+ ) + + # --tsvfile is the name of the file to read results from + # --esdocument is the ElasticSearch document to push these results into + # + parser.add_argument( + "--db_dir", + default="~/tmp/rocksdb-benchmark-datadir", + help="Database directory hierarchy to use", + ) + parser.add_argument( + "--output_dir", + default="~/tmp/benchmark-results", + help="Benchmark output goes here", + ) + parser.add_argument( + "--num_keys", + default="10000", + help="Number of database keys to use in benchmark test(s) (determines size of test job)", + ) + args = parser.parse_args() + config = Config(args) + + version = read_version(config) + if version is None: + raise Exception(f"Could not read RocksDB version from {config.version_file}") + version_str = f"{version[0]}.{version[1]}.{version[2]}" + logging.info(f"Run benchmark_ci with RocksDB version {version_str}") + + prepare(version_str, config) + + try: + env = get_benchmark_env() + env.append(("NUM_KEYS", args.num_keys)) + cmd = [ + config.benchmark_script, + config.data_dir, + config.results_dir, + version_str, + ] + logging.info(f"Run {cmd} env={env} cwd={config.benchmark_cwd}") + subprocess.run(cmd, env=dict(env), cwd=config.benchmark_cwd) + + results(version_str, config) + finally: + cleanup(version_str, config) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/rocksdb/tools/benchmark_compare.sh b/src/rocksdb/tools/benchmark_compare.sh new file mode 100755 index 000000000..ef7990279 --- /dev/null +++ b/src/rocksdb/tools/benchmark_compare.sh @@ -0,0 +1,342 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# REQUIRE: db_bench binary exists in the current directory + +dbdir=$1 +odir=$2 + +# Size Constants +K=1024 +M=$((1024 * K)) + +# Dynamic loader configuration +ld_library_path=${LD_LIBRARY_PATH:-""} + +# Benchmark configuration +duration_rw=${DURATION_RW:-65} +duration_ro=${DURATION_RO:-65} +num_keys=${NUM_KEYS:-1000000} +num_threads=${NUM_THREADS:-16} +key_size=${KEY_SIZE:-20} +value_size=${VALUE_SIZE:-400} +mb_write_per_sec=${MB_WRITE_PER_SEC:-2} +ci_tests_only=${CI_TESTS_ONLY:-"false"} + +# RocksDB configuration +compression_type=${COMPRESSION_TYPE:-lz4} +subcompactions=${SUBCOMPACTIONS:-1} +write_buffer_size_mb=${WRITE_BUFFER_SIZE_MB:-32} +target_file_size_base_mb=${TARGET_FILE_SIZE_BASE_MB:-32} +max_bytes_for_level_base_mb=${MAX_BYTES_FOR_LEVEL_BASE_MB:-128} +max_background_jobs=${MAX_BACKGROUND_JOBS:-8} +stats_interval_seconds=${STATS_INTERVAL_SECONDS:-20} +cache_index_and_filter_blocks=${CACHE_INDEX_AND_FILTER_BLOCKS:-0} +# USE_O_DIRECT doesn't need a default +bytes_per_sync=${BYTES_PER_SYNC:-$(( 1 * M ))} +# CACHE_SIZE_MB doesn't need a default +min_level_to_compress=${MIN_LEVEL_TO_COMPRESS:-"-1"} + +compaction_style=${COMPACTION_STYLE:-leveled} +if [ "$compaction_style" = "leveled" ]; then + echo Use leveled compaction +elif [ "$compaction_style" = "universal" ]; then + echo Use universal compaction +elif [ "$compaction_style" = "blob" ]; then + echo Use blob compaction +else + echo COMPACTION_STYLE is :: "$COMPACTION_STYLE" :: and must be one of leveled, universal, blob + exit 1 +fi + +# Leveled compaction configuration +level0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4} +level0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20} +level0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30} +per_level_fanout=${PER_LEVEL_FANOUT:-8} + +# Universal compaction configuration +universal_min_merge_width=${UNIVERSAL_MIN_MERGE_WIDTH:-2} +universal_max_merge_width=${UNIVERSAL_MAX_MERGE_WIDTH:-20} 
+universal_size_ratio=${UNIVERSAL_SIZE_RATIO:-1} +universal_max_size_amp=${UNIVERSAL_MAX_SIZE_AMP:-200} +universal_compression_size_percent=${UNIVERSAL_COMPRESSION_SIZE_PERCENT:-"-1"} + +# Integrated BlobDB configuration + +min_blob_size=${MIN_BLOB_SIZE:-0} +blob_file_size=${BLOB_FILE_SIZE:-$(( 256 * M ))} +blob_compression_type=${BLOB_COMPRESSION_TYPE:-${compression_type}} +blob_gc_age_cutoff=${BLOB_GC_AGE_CUTOFF:-"0.25"} +blob_gc_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1} + +# Arguments for dynamic loading +base_args=( LD_LIBRARY_PATH="$ld_library_path" ) + +# Arguments used for all tests +base_args+=( NUM_KEYS="$num_keys" ) +base_args+=( NUM_THREADS="$num_threads" ) +base_args+=( KEY_SIZE="$key_size" ) +base_args+=( VALUE_SIZE="$value_size" ) + +base_args+=( SUBCOMPACTIONS="$subcompactions" ) +base_args+=( COMPRESSION_TYPE="$compression_type" ) +base_args+=( WRITE_BUFFER_SIZE_MB="$write_buffer_size_mb" ) +base_args+=( TARGET_FILE_SIZE_BASE_MB="$target_file_size_base_mb" ) +base_args+=( MAX_BYTES_FOR_LEVEL_BASE_MB="$max_bytes_for_level_base_mb" ) +base_args+=( MAX_BACKGROUND_JOBS="$max_background_jobs" ) +base_args+=( STATS_INTERVAL_SECONDS="$stats_interval_seconds" ) +base_args+=( CACHE_INDEX_AND_FILTER_BLOCKS="$cache_index_and_filter_blocks" ) +base_args+=( COMPACTION_STYLE="$compaction_style" ) +base_args+=( BYTES_PER_SYNC="$bytes_per_sync" ) + +if [ -n "$USE_O_DIRECT" ]; then + base_args+=( USE_O_DIRECT=1 ) +fi + +if [ -n "$NUMA" ]; then + base_args+=( NUMACTL=1 ) +fi + +if [ -n "$CACHE_SIZE_MB" ]; then + cacheb=$(( CACHE_SIZE_MB * M )) + base_args+=( CACHE_SIZE="$cacheb" ) +fi + +if [ "$compaction_style" == "leveled" ]; then + base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" ) + base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" ) + base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" ) + base_args+=( PER_LEVEL_FANOUT="$per_level_fanout" ) +elif [ "$compaction_style" == 
"universal" ]; then + base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" ) + base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" ) + base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" ) + base_args+=( UNIVERSAL_MIN_MERGE_WIDTH="$universal_min_merge_width" ) + base_args+=( UNIVERSAL_MAX_MERGE_WIDTH="$universal_max_merge_width" ) + base_args+=( UNIVERSAL_SIZE_RATIO="$universal_size_ratio" ) + base_args+=( UNIVERSAL_MAX_SIZE_AMP="$universal_max_size_amp" ) + if [ -n "$UNIVERSAL_ALLOW_TRIVIAL_MOVE" ]; then + base_args+=( UNIVERSAL_ALLOW_TRIVIAL_MOVE=1 ) + fi +else + # Inherit settings for leveled because index uses leveled LSM + base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" ) + base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" ) + base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" ) + base_args+=( PER_LEVEL_FANOUT="$per_level_fanout" ) + # Then add BlobDB specific settings + base_args+=( MIN_BLOB_SIZE="$min_blob_size" ) + base_args+=( BLOB_FILE_SIZE="$blob_file_size" ) + base_args+=( BLOB_COMPRESSION_TYPE="$blob_compression_type" ) + base_args+=( BLOB_GC_AGE_CUTOFF="$blob_gc_age_cutoff" ) + base_args+=( BLOB_GC_FORCE_THRESHOLD="$blob_gc_force_threshold" ) +fi + +function usage { + echo "usage: benchmark_compare.sh db_dir output_dir version+" + echo -e "\tdb_dir\t\tcreate RocksDB database in this directory" + echo -e "\toutput_dir\twrite output from performance tests in this directory" + echo -e "\tversion+\tspace separated sequence of RocksDB versions to test." + echo -e "\nThis expects that db_bench.\$version exists in \$PWD for each version in the sequence." 
+ echo -e "An example value for version+ is 6.23.0 6.24.0" + echo "" + echo -e "Environment variables for options" + echo -e "\tNUM_KEYS\t\t\tnumber of keys to load" + echo -e "\tKEY_SIZE\t\t\tsize of key" + echo -e "\tVALUE_SIZE\t\t\tsize of value" + echo -e "\tCACHE_SIZE_MB\t\t\tsize of block cache in MB" + echo -e "\tDURATION_RW\t\t\tnumber of seconds for which each test runs, except for read-only tests" + echo -e "\tDURATION_RO\t\t\tnumber of seconds for which each read-only test runs" + echo -e "\tMB_WRITE_PER_SEC\t\trate limit for writer that runs concurrent with queries for some tests" + echo -e "\tNUM_THREADS\t\t\tnumber of user threads" + echo -e "\tCOMPRESSION_TYPE\t\tcompression type (zstd, lz4, none, etc)" + echo -e "\tMIN_LEVEL_TO_COMPRESS\t\tmin_level_to_compress for leveled" + echo -e "\tWRITE_BUFFER_SIZE_MB\t\tsize of write buffer in MB" + echo -e "\tTARGET_FILE_SIZE_BASE_MB\tvalue for target_file_size_base in MB" + echo -e "\tMAX_BYTES_FOR_LEVEL_BASE_MB\tvalue for max_bytes_for_level_base in MB" + echo -e "\tMAX_BACKGROUND_JOBS\t\tvalue for max_background_jobs" + echo -e "\tCACHE_INDEX_AND_FILTER_BLOCKS\tvalue for cache_index_and_filter_blocks" + echo -e "\tUSE_O_DIRECT\t\t\tUse O_DIRECT for user reads and compaction" + echo -e "\tBYTES_PER_SYNC\t\t\tValue for bytes_per_sync" + echo -e "\tSTATS_INTERVAL_SECONDS\t\tvalue for stats_interval_seconds" + echo -e "\tSUBCOMPACTIONS\t\t\tvalue for subcompactions" + echo -e "\tCOMPACTION_STYLE\t\tCompaction style to use, one of: leveled, universal, blob" + echo -e "\tCI_TESTS_ONLY\t\tRun a subset of tests tailored to a CI regression job, one of: true, false (default)" + echo "" + echo -e "\tOptions specific to leveled compaction:" + echo -e "\t\tLEVEL0_FILE_NUM_COMPACTION_TRIGGER\tvalue for level0_file_num_compaction_trigger" + echo -e "\t\tLEVEL0_SLOWDOWN_WRITES_TRIGGER\t\tvalue for level0_slowdown_writes_trigger" + echo -e "\t\tLEVEL0_STOP_WRITES_TRIGGER\t\tvalue for level0_stop_writes_trigger" + echo -e 
"\t\tPER_LEVEL_FANOUT\t\t\tvalue for max_bytes_for_level_multiplier" + echo "" + echo -e "\tOptions specific to universal compaction:" + echo -e "\t\tSee LEVEL0_*_TRIGGER above" + echo -e "\t\tUNIVERSAL_MIN_MERGE_WIDTH\t\tvalue of min_merge_width option for universal" + echo -e "\t\tUNIVERSAL_MAX_MERGE_WIDTH\t\tvalue of min_merge_width option for universal" + echo -e "\t\tUNIVERSAL_SIZE_RATIO\t\t\tvalue of size_ratio option for universal" + echo -e "\t\tUNIVERSAL_MAX_SIZE_AMP\t\t\tmax_size_amplification_percent for universal" + echo -e "\t\tUNIVERSAL_ALLOW_TRIVIAL_MOVE\t\tSet allow_trivial_move to true for universal, default is false" + echo -e "\t\tUNIVERSAL_COMPRESSION_SIZE_PERCENT\tpercentage of LSM tree that should be compressed" + echo "" + echo -e "\tOptions for integrated BlobDB:" + echo -e "\t\tMIN_BLOB_SIZE\t\t\t\tvalue for min_blob_size" + echo -e "\t\tBLOB_FILE_SIZE\t\t\t\tvalue for blob_file_size" + echo -e "\t\tBLOB_COMPRESSION_TYPE\t\t\tvalue for blob_compression_type" + echo -e "\t\tBLOB_GC_AGE_CUTOFF\t\t\tvalue for blog_garbage_collection_age_cutoff" + echo -e "\t\tBLOB_GC_FORCE_THRESHOLD\t\t\tvalue for blog_garbage_collection_force_threshold" +} + +function dump_env { + echo "Base args" > "$odir"/args + echo "${base_args[@]}" | tr ' ' '\n' >> "$odir"/args + + echo -e "\nOther args" >> "$odir"/args + echo -e "dbdir\t$dbdir" >> "$odir"/args + echo -e "duration_rw\t$duration_rw" >> "$odir"/args + echo -e "duration_ro\t$duration_ro" >> "$odir"/args + echo -e "per_level_fanout\t$per_level_fanout" >> "$odir"/args + + echo -e "\nargs_load:" >> "$odir"/args + echo "${args_load[@]}" | tr ' ' '\n' >> "$odir"/args + echo -e "\nargs_nolim:" >> "$odir"/args + echo "${args_nolim[@]}" | tr ' ' '\n' >> "$odir"/args + echo -e "\nargs_lim:" >> "$odir"/args + echo "${args_lim[@]}" | tr ' ' '\n' >> "$odir"/args +} + +if [ $# -lt 3 ]; then + usage + echo + echo "Need at least 3 arguments" + exit 1 +fi + +shift 2 + +mkdir -p "$odir" + +echo Test versions: "$@" +echo 
Test versions: "$@" >> "$odir"/args + +for v in "$@" ; do + my_odir="$odir"/"$v" + + if [ -d "$my_odir" ]; then + echo Exiting because the output directory exists: "$my_odir" + exit 1 + fi + + args_common=("${base_args[@]}") + + args_common+=( OUTPUT_DIR="$my_odir" DB_DIR="$dbdir" WAL_DIR="$dbdir" DB_BENCH_NO_SYNC=1 ) + + if [ "$compaction_style" == "leveled" ]; then + args_common+=( MIN_LEVEL_TO_COMPRESS="$min_level_to_compress" ) + elif [ "$compaction_style" == "universal" ]; then + args_common+=( UNIVERSAL=1 COMPRESSION_SIZE_PERCENT="$universal_compression_size_percent" ) + else + args_common+=( MIN_LEVEL_TO_COMPRESS="$min_level_to_compress" ) + fi + + args_load=("${args_common[@]}") + + args_nolim=("${args_common[@]}") + + args_lim=("${args_nolim[@]}") + args_lim+=( MB_WRITE_PER_SEC="$mb_write_per_sec" ) + + dump_env + + echo Run benchmark for "$v" at "$( date )" with results at "$my_odir" + rm -f db_bench + echo ln -s db_bench."$v" db_bench + ln -s db_bench."$v" db_bench + + find "$dbdir" -type f -exec rm \{\} \; + + # Load in key order + echo env "${args_load[@]}" bash ./benchmark.sh fillseq_disable_wal + env -i "${args_load[@]}" bash ./benchmark.sh fillseq_disable_wal + + # Read-only tests. The LSM tree shape is in a deterministic state if trivial move + # was used during the load. + + # Add revrange with a fixed duration and hardwired number of keys and threads to give + # compaction debt leftover from fillseq a chance at being removed. Not using waitforcompaction + # here because it isn't supported on older db_bench versions. 
+ env -i "${args_nolim[@]}" DURATION=300 NUM_KEYS=100 NUM_THREADS=1 bash ./benchmark.sh revrange + env -i "${args_nolim[@]}" DURATION="$duration_ro" bash ./benchmark.sh readrandom + + # Skipped for CI - a single essentail readrandom is enough to set up for other tests + if [ "$ci_tests_only" != "true" ]; then + env -i "${args_nolim[@]}" DURATION="$duration_ro" bash ./benchmark.sh fwdrange + env -i "${args_lim[@]}" DURATION="$duration_ro" bash ./benchmark.sh multireadrandom --multiread_batched + else + echo "CI_TESTS_ONLY is set, skipping optional read steps." + fi + + # Write 10% of the keys. The goal is to randomize keys prior to Lmax + p10=$( echo "$num_keys" "$num_threads" | awk '{ printf "%.0f", $1 / $2 / 10.0 }' ) + env -i "${args_nolim[@]}" WRITES="$p10" bash ./benchmark.sh overwritesome + + if [ "$compaction_style" == "leveled" ]; then + # These are not supported by older versions + # Flush memtable & L0 to get LSM tree into deterministic state + env -i "${args_nolim[@]}" bash ./benchmark.sh flush_mt_l0 + elif [ "$compaction_style" == "universal" ]; then + # For universal don't compact L0 as can have too many sorted runs + # waitforcompaction can hang, see https://github.com/facebook/rocksdb/issues/9275 + # While this is disabled the test that follows will have more variance from compaction debt. 
+ # env -i "${args_nolim[@]}" bash ./benchmark.sh waitforcompaction + echo TODO enable when waitforcompaction hang is fixed + else + # These are not supported by older versions + # Flush memtable & L0 to get LSM tree into deterministic state + env -i "${args_nolim[@]}" bash ./benchmark.sh flush_mt_l0 + fi + + # Read-mostly tests with a rate-limited writer + env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh revrangewhilewriting + env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh fwdrangewhilewriting + env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh readwhilewriting + + # Write-only tests + + # This creates much compaction debt which will be a problem for tests added after it. + # Also, the compaction stats measured at test end can underestimate write-amp depending + # on how much compaction debt is allowed. + if [ "$compaction_style" == "leveled" ] && ./db_bench --benchmarks=waitforcompaction ; then + # Use waitforcompaction to get more accurate write-amp measurement + env -i "${args_nolim[@]}" DURATION="$duration_rw" bash ./benchmark.sh overwriteandwait + else + # waitforcompaction hangs with universal, see https://github.com/facebook/rocksdb/issues/9275 + env -i "${args_nolim[@]}" DURATION="$duration_rw" bash ./benchmark.sh overwrite + fi + + cp "$dbdir"/LOG* "$my_odir" + gzip -9 "$my_odir"/LOG* + +done + +# Generate a file that groups lines from the same test for all versions +basev=$1 +nlines=$( awk '/^ops_sec/,/END/' "$odir"/"$basev"/report.tsv | grep -v ops_sec | wc -l ) +hline=$( awk '/^ops_sec/ { print NR }' "$odir"/"$basev"/report.tsv ) +sline=$(( hline + 1 )) +eline=$(( sline + nlines - 1 )) + +sum_file="$odir"/summary.tsv + +for v in "$@" ; do + echo "$odir"/"$v"/report.tsv +done >> "$sum_file" +echo >> "$sum_file" + +for x in $( seq "$sline" "$eline" ); do + awk '{ if (NR == lno) { print $0 } }' lno="$hline" "$odir"/"$basev"/report.tsv >> "$sum_file" + for v in "$@" ; do + r="$odir"/"$v"/report.tsv 
+ awk '{ if (NR == lno) { print $0 } }' lno="$x" "$r" >> "$sum_file" + done +echo >> "$sum_file" +done diff --git a/src/rocksdb/tools/benchmark_leveldb.sh b/src/rocksdb/tools/benchmark_leveldb.sh new file mode 100755 index 000000000..069b53a9f --- /dev/null +++ b/src/rocksdb/tools/benchmark_leveldb.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# REQUIRE: db_bench binary exists in the current directory +# +# This should be used with the LevelDB fork listed here to use additional test options. +# For more details on the changes see the blog post listed below. +# https://github.com/mdcallag/leveldb-1 +# http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html + +if [ $# -ne 1 ]; then + echo -n "./benchmark.sh [fillseq/overwrite/readrandom/readwhilewriting]" + exit 0 +fi + +# size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +if [ -z $DB_DIR ]; then + echo "DB_DIR is not defined" + exit 0 +fi + +output_dir=${OUTPUT_DIR:-/tmp/} +if [ ! -d $output_dir ]; then + mkdir -p $output_dir +fi + +# all multithreaded tests run with sync=1 unless +# $DB_BENCH_NO_SYNC is defined +syncval="1" +if [ ! 
-z $DB_BENCH_NO_SYNC ]; then + echo "Turning sync off for all multithreaded tests" + syncval="0"; +fi + +num_threads=${NUM_THREADS:-16} +# Only for *whilewriting, *whilemerging +writes_per_second=${WRITES_PER_SECOND:-$((10 * K))} +cache_size=${CACHE_SIZE:-$((1 * G))} + +num_keys=${NUM_KEYS:-$((1 * G))} +key_size=20 +value_size=${VALUE_SIZE:-400} +block_size=${BLOCK_SIZE:-4096} + +const_params=" + --db=$DB_DIR \ + \ + --num=$num_keys \ + --value_size=$value_size \ + --cache_size=$cache_size \ + --compression_ratio=0.5 \ + \ + --write_buffer_size=$((2 * M)) \ + \ + --histogram=1 \ + \ + --bloom_bits=10 \ + --open_files=$((20 * K))" + +params_w="$const_params " + +function summarize_result { + test_out=$1 + test_name=$2 + bench_name=$3 + nthr=$4 + + usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' ) + mb_sec=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $5 }' ) + ops=$( grep "^Count:" $test_out | awk '{ print $2 }' ) + ops_sec=$( echo "scale=0; (1000000.0 * $nthr) / $usecs_op" | bc ) + avg=$( grep "^Count:" $test_out | awk '{ printf "%.1f", $4 }' ) + p50=$( grep "^Min:" $test_out | awk '{ printf "%.1f", $4 }' ) + echo -e "$ops_sec\t$mb_sec\t$usecs_op\t$avg\t$p50\t$test_name" \ + >> $output_dir/report.txt +} + +function run_fillseq { + # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the + # client can discover where to restart a load after a crash. I think this is a good way to load. 
+ echo "Loading $num_keys keys sequentially" + cmd="./db_bench --benchmarks=fillseq \ + --use_existing_db=0 \ + --sync=0 \ + $params_w \ + --threads=1 \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log" + echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log + eval $cmd + summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq 1 +} + +function run_change { + operation=$1 + echo "Do $num_keys random $operation" + out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log" + cmd="./db_bench --benchmarks=$operation \ + --use_existing_db=1 \ + --sync=$syncval \ + $params_w \ + --threads=$num_threads \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} + eval $cmd + summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation $num_threads +} + +function run_readrandom { + echo "Reading $num_keys random keys" + out_name="benchmark_readrandom.t${num_threads}.log" + cmd="./db_bench --benchmarks=readrandom \ + --use_existing_db=1 \ + $params_w \ + --threads=$num_threads \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} + eval $cmd + summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom $num_threads +} + +function run_readwhile { + operation=$1 + echo "Reading $num_keys random keys while $operation" + out_name="benchmark_readwhile${operation}.t${num_threads}.log" + cmd="./db_bench --benchmarks=readwhile${operation} \ + --use_existing_db=1 \ + --sync=$syncval \ + $params_w \ + --threads=$num_threads \ + --writes_per_second=$writes_per_second \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $output_dir/${out_name}" + echo $cmd | tee $output_dir/${out_name} + eval $cmd + summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation} $num_threads +} + +function now() { + echo 
`date +"%s"` +} + +report="$output_dir/report.txt" +schedule="$output_dir/schedule.txt" + +echo "===== Benchmark =====" + +# Run!!! +IFS=',' read -a jobs <<< $1 +# shellcheck disable=SC2068 +for job in ${jobs[@]}; do + + if [ $job != debug ]; then + echo "Start $job at `date`" | tee -a $schedule + fi + + start=$(now) + if [ $job = fillseq ]; then + run_fillseq + elif [ $job = overwrite ]; then + run_change overwrite + elif [ $job = readrandom ]; then + run_readrandom + elif [ $job = readwhilewriting ]; then + run_readwhile writing + elif [ $job = debug ]; then + num_keys=1000; # debug + echo "Setting num_keys to $num_keys" + else + echo "unknown job $job" + exit + fi + end=$(now) + + if [ $job != debug ]; then + echo "Complete $job in $((end-start)) seconds" | tee -a $schedule + fi + + echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest" + tail -1 $output_dir/report.txt + +done diff --git a/src/rocksdb/tools/blob_dump.cc b/src/rocksdb/tools/blob_dump.cc new file mode 100644 index 000000000..1f75eb20d --- /dev/null +++ b/src/rocksdb/tools/blob_dump.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE +#include <getopt.h> + +#include <cstdio> +#include <string> +#include <unordered_map> + +#include "utilities/blob_db/blob_dump_tool.h" + +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool; + +int main(int argc, char** argv) { + using DisplayType = BlobDumpTool::DisplayType; + const std::unordered_map<std::string, DisplayType> display_types = { + {"none", DisplayType::kNone}, + {"raw", DisplayType::kRaw}, + {"hex", DisplayType::kHex}, + {"detail", DisplayType::kDetail}, + }; + const struct option options[] = { + {"help", no_argument, nullptr, 'h'}, + {"file", required_argument, nullptr, 'f'}, + {"show_key", optional_argument, nullptr, 'k'}, + {"show_blob", optional_argument, nullptr, 'b'}, + {"show_uncompressed_blob", optional_argument, nullptr, 'r'}, + {"show_summary", optional_argument, nullptr, 's'}, + }; + DisplayType show_key = DisplayType::kRaw; + DisplayType show_blob = DisplayType::kNone; + DisplayType show_uncompressed_blob = DisplayType::kNone; + bool show_summary = false; + std::string file; + while (true) { + int c = getopt_long(argc, argv, "hk::b::f:", options, nullptr); + if (c < 0) { + break; + } + std::string arg_str(optarg ? 
optarg : ""); + switch (c) { + case 'h': + fprintf(stdout, + "Usage: blob_dump --file=filename " + "[--show_key[=none|raw|hex|detail]] " + "[--show_blob[=none|raw|hex|detail]] " + "[--show_uncompressed_blob[=none|raw|hex|detail]] " + "[--show_summary]\n"); + return 0; + case 'f': + file = optarg; + break; + case 'k': + if (optarg) { + if (display_types.count(arg_str) == 0) { + fprintf(stderr, "Unrecognized key display type.\n"); + return -1; + } + show_key = display_types.at(arg_str); + } + break; + case 'b': + if (optarg) { + if (display_types.count(arg_str) == 0) { + fprintf(stderr, "Unrecognized blob display type.\n"); + return -1; + } + show_blob = display_types.at(arg_str); + } else { + show_blob = DisplayType::kHex; + } + break; + case 'r': + if (optarg) { + if (display_types.count(arg_str) == 0) { + fprintf(stderr, "Unrecognized blob display type.\n"); + return -1; + } + show_uncompressed_blob = display_types.at(arg_str); + } else { + show_uncompressed_blob = DisplayType::kHex; + } + break; + case 's': + show_summary = true; + break; + default: + fprintf(stderr, "Unrecognized option.\n"); + return -1; + } + } + BlobDumpTool tool; + Status s = + tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary); + if (!s.ok()) { + fprintf(stderr, "Failed: %s\n", s.ToString().c_str()); + return -1; + } + return 0; +} +#else +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return -1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/block_cache_analyzer/__init__.py b/src/rocksdb/tools/block_cache_analyzer/__init__.py new file mode 100644 index 000000000..8dbe96a78 --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py new file mode 100644 index 000000000..67307df53 --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py @@ -0,0 +1,2000 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import gc +import heapq +import random +import sys +import time +from collections import OrderedDict +from os import path + +import numpy as np + + +kSampleSize = 64 # The sample size used when performing eviction. +kMicrosInSecond = 1000000 +kSecondsInMinute = 60 +kSecondsInHour = 3600 + + +class TraceRecord: + """ + A trace record represents a block access. + It holds the same struct as BlockCacheTraceRecord in + trace_replay/block_cache_tracer.h + """ + + def __init__( + self, + access_time, + block_id, + block_type, + block_size, + cf_id, + cf_name, + level, + fd, + caller, + no_insert, + get_id, + key_id, + kv_size, + is_hit, + referenced_key_exist_in_block, + num_keys_in_block, + table_id, + seq_number, + block_key_size, + key_size, + block_offset_in_file, + next_access_seq_no, + ): + self.access_time = access_time + self.block_id = block_id + self.block_type = block_type + self.block_size = block_size + block_key_size + self.cf_id = cf_id + self.cf_name = cf_name + self.level = level + self.fd = fd + self.caller = caller + if no_insert == 1: + self.no_insert = True + else: + self.no_insert = False + self.get_id = get_id + self.key_id = key_id + self.kv_size = kv_size + if is_hit == 1: + self.is_hit = True + else: + self.is_hit = False + if referenced_key_exist_in_block == 1: + self.referenced_key_exist_in_block = True + else: + self.referenced_key_exist_in_block = False + self.num_keys_in_block = num_keys_in_block + self.table_id = table_id + self.seq_number = seq_number + self.block_key_size = block_key_size + self.key_size = key_size + self.block_offset_in_file = 
block_offset_in_file + self.next_access_seq_no = next_access_seq_no + + +class CacheEntry: + """A cache entry stored in the cache.""" + + def __init__( + self, + value_size, + cf_id, + level, + block_type, + table_id, + access_number, + time_s, + num_hits=0, + ): + self.value_size = value_size + self.last_access_number = access_number + self.num_hits = num_hits + self.cf_id = 0 + self.level = level + self.block_type = block_type + self.last_access_time = time_s + self.insertion_time = time_s + self.table_id = table_id + + def __repr__(self): + """Debug string.""" + return "(s={},last={},hits={},cf={},l={},bt={})\n".format( + self.value_size, + self.last_access_number, + self.num_hits, + self.cf_id, + self.level, + self.block_type, + ) + + def cost_class(self, cost_class_label): + if cost_class_label == "table_bt": + return "{}-{}".format(self.table_id, self.block_type) + elif cost_class_label == "table": + return "{}".format(self.table_id) + elif cost_class_label == "bt": + return "{}".format(self.block_type) + elif cost_class_label == "cf": + return "{}".format(self.cf_id) + elif cost_class_label == "cf_bt": + return "{}-{}".format(self.cf_id, self.block_type) + elif cost_class_label == "table_level_bt": + return "{}-{}-{}".format(self.table_id, self.level, self.block_type) + assert False, "Unknown cost class label {}".format(cost_class_label) + return None + + +class HashEntry: + """A hash entry stored in a hash table.""" + + def __init__(self, key, hash, value): + self.key = key + self.hash = hash + self.value = value + + def __repr__(self): + return "k={},h={},v=[{}]".format(self.key, self.hash, self.value) + + +class HashTable: + """ + A custom implementation of hash table to support fast random sampling. + It is closed hashing and uses chaining to resolve hash conflicts. + It grows/shrinks the hash table upon insertion/deletion to support + fast lookups and random samplings. 
+ """ + + def __init__(self): + self.initial_size = 32 + self.table = [None] * self.initial_size + self.elements = 0 + + def random_sample(self, sample_size): + """Randomly sample 'sample_size' hash entries from the table.""" + samples = [] + index = random.randint(0, len(self.table) - 1) + pos = index + # Starting from index, adding hash entries to the sample list until + # sample_size is met or we ran out of entries. + while True: + if self.table[pos] is not None: + for i in range(len(self.table[pos])): + if self.table[pos][i] is None: + continue + samples.append(self.table[pos][i]) + if len(samples) == sample_size: + break + pos += 1 + pos = pos % len(self.table) + if pos == index or len(samples) == sample_size: + break + assert len(samples) <= sample_size + return samples + + def __repr__(self): + all_entries = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_entries.append(self.table[i][j]) + return "{}".format(all_entries) + + def values(self): + all_values = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_values.append(self.table[i][j].value) + return all_values + + def __len__(self): + return self.elements + + def insert(self, key, hash, value): + """ + Insert a hash entry in the table. Replace the old entry if it already + exists. + """ + self.grow() + inserted = False + index = hash % len(self.table) + if self.table[index] is None: + self.table[index] = [] + # Search for the entry first. + for i in range(len(self.table[index])): + if self.table[index][i] is None: + continue + if self.table[index][i].hash == hash and self.table[index][i].key == key: + # The entry already exists in the table. + self.table[index][i] = HashEntry(key, hash, value) + return + + # Find an empty slot. 
+ for i in range(len(self.table[index])): + if self.table[index][i] is None: + self.table[index][i] = HashEntry(key, hash, value) + inserted = True + break + if not inserted: + self.table[index].append(HashEntry(key, hash, value)) + self.elements += 1 + + def resize(self, new_size): + if new_size == len(self.table): + return + if new_size < self.initial_size: + return + if self.elements < 100: + return + new_table = [None] * new_size + # Copy 'self.table' to new_table. + for i in range(len(self.table)): + entries = self.table[i] + if entries is None: + continue + for j in range(len(entries)): + if entries[j] is None: + continue + index = entries[j].hash % new_size + if new_table[index] is None: + new_table[index] = [] + new_table[index].append(entries[j]) + self.table = new_table + del new_table + # Manually call python gc here to free the memory as 'self.table' + # might be very large. + gc.collect() + + def grow(self): + if self.elements < 4 * len(self.table): + return + new_size = int(len(self.table) * 1.5) + self.resize(new_size) + + def delete(self, key, hash): + index = hash % len(self.table) + deleted = False + deleted_entry = None + if self.table[index] is None: + return + for i in range(len(self.table[index])): + if ( + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key + ): + deleted_entry = self.table[index][i] + self.table[index][i] = None + self.elements -= 1 + deleted = True + break + if deleted: + self.shrink() + return deleted_entry + + def shrink(self): + if self.elements * 2 >= len(self.table): + return + new_size = int(len(self.table) * 0.7) + self.resize(new_size) + + def lookup(self, key, hash): + index = hash % len(self.table) + if self.table[index] is None: + return None + for i in range(len(self.table[index])): + if ( + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key + ): + return self.table[index][i].value + return None 
+ + +class MissRatioStats: + def __init__(self, time_unit): + self.num_misses = 0 + self.num_accesses = 0 + self.time_unit = time_unit + self.time_misses = {} + self.time_miss_bytes = {} + self.time_accesses = {} + + def update_metrics(self, access_time, is_hit, miss_bytes): + access_time /= kMicrosInSecond * self.time_unit + self.num_accesses += 1 + if access_time not in self.time_accesses: + self.time_accesses[access_time] = 0 + self.time_accesses[access_time] += 1 + if not is_hit: + self.num_misses += 1 + if access_time not in self.time_misses: + self.time_misses[access_time] = 0 + self.time_miss_bytes[access_time] = 0 + self.time_misses[access_time] += 1 + self.time_miss_bytes[access_time] += miss_bytes + + def reset_counter(self): + self.num_misses = 0 + self.num_accesses = 0 + self.time_miss_bytes.clear() + self.time_misses.clear() + self.time_accesses.clear() + + def compute_miss_bytes(self): + miss_bytes = [] + for at in self.time_miss_bytes: + miss_bytes.append(self.time_miss_bytes[at]) + miss_bytes = sorted(miss_bytes) + avg_miss_bytes = 0 + p95_miss_bytes = 0 + for i in range(len(miss_bytes)): + avg_miss_bytes += float(miss_bytes[i]) / float(len(miss_bytes)) + + p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1) + p95_miss_bytes = miss_bytes[p95_index] + return avg_miss_bytes, p95_miss_bytes + + def miss_ratio(self): + return float(self.num_misses) * 100.0 / float(self.num_accesses) + + def write_miss_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): + start /= kMicrosInSecond * self.time_unit + end /= kMicrosInSecond * self.time_unit + header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + if not path.exists(header_file_path): + with open(header_file_path, "w+") as header_file: + header = "time" + for trace_time in range(start, end): + header += ",{}".format(trace_time) + header_file.write(header + "\n") + 
file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + with open(file_path, "w+") as file: + row = "{}".format(cache_type) + for trace_time in range(start, end): + row += ",{}".format(self.time_misses.get(trace_time, 0)) + file.write(row + "\n") + + def write_miss_ratio_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): + start /= kMicrosInSecond * self.time_unit + end /= kMicrosInSecond * self.time_unit + header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + if not path.exists(header_file_path): + with open(header_file_path, "w+") as header_file: + header = "time" + for trace_time in range(start, end): + header += ",{}".format(trace_time) + header_file.write(header + "\n") + file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + with open(file_path, "w+") as file: + row = "{}".format(cache_type) + for trace_time in range(start, end): + naccesses = self.time_accesses.get(trace_time, 0) + miss_ratio = 0 + if naccesses > 0: + miss_ratio = float( + self.time_misses.get(trace_time, 0) * 100.0 + ) / float(naccesses) + row += ",{0:.2f}".format(miss_ratio) + file.write(row + "\n") + + +class PolicyStats: + def __init__(self, time_unit, policies): + self.time_selected_polices = {} + self.time_accesses = {} + self.policy_names = {} + self.time_unit = time_unit + for i in range(len(policies)): + self.policy_names[i] = policies[i].policy_name() + + def update_metrics(self, access_time, selected_policy): + access_time /= kMicrosInSecond * self.time_unit + if access_time not in self.time_accesses: + self.time_accesses[access_time] = 0 + self.time_accesses[access_time] += 1 + if access_time not in self.time_selected_polices: + self.time_selected_polices[access_time] = {} + policy_name = 
self.policy_names[selected_policy] + if policy_name not in self.time_selected_polices[access_time]: + self.time_selected_polices[access_time][policy_name] = 0 + self.time_selected_polices[access_time][policy_name] += 1 + + def write_policy_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): + start /= kMicrosInSecond * self.time_unit + end /= kMicrosInSecond * self.time_unit + header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + if not path.exists(header_file_path): + with open(header_file_path, "w+") as header_file: + header = "time" + for trace_time in range(start, end): + header += ",{}".format(trace_time) + header_file.write(header + "\n") + file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + with open(file_path, "w+") as file: + for policy in self.policy_names: + policy_name = self.policy_names[policy] + row = "{}-{}".format(cache_type, policy_name) + for trace_time in range(start, end): + row += ",{}".format( + self.time_selected_polices.get(trace_time, {}).get( + policy_name, 0 + ) + ) + file.write(row + "\n") + + def write_policy_ratio_timeline( + self, cache_type, cache_size, target_cf_name, file_path, start, end + ): + start /= kMicrosInSecond * self.time_unit + end /= kMicrosInSecond * self.time_unit + header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + if not path.exists(header_file_path): + with open(header_file_path, "w+") as header_file: + header = "time" + for trace_time in range(start, end): + header += ",{}".format(trace_time) + header_file.write(header + "\n") + file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name + ) + with open(file_path, "w+") as file: + for 
policy in self.policy_names: + policy_name = self.policy_names[policy] + row = "{}-{}".format(cache_type, policy_name) + for trace_time in range(start, end): + naccesses = self.time_accesses.get(trace_time, 0) + ratio = 0 + if naccesses > 0: + ratio = float( + self.time_selected_polices.get(trace_time, {}).get( + policy_name, 0 + ) + * 100.0 + ) / float(naccesses) + row += ",{0:.2f}".format(ratio) + file.write(row + "\n") + + +class Policy(object): + """ + A policy maintains a set of evicted keys. It returns a reward of one to + itself if it has not evicted a missing key. Otherwise, it gives itself 0 + reward. + """ + + def __init__(self): + self.evicted_keys = {} + + def evict(self, key, max_size): + self.evicted_keys[key] = 0 + + def delete(self, key): + self.evicted_keys.pop(key, None) + + def prioritize_samples(self, samples, auxilliary_info): + raise NotImplementedError + + def policy_name(self): + raise NotImplementedError + + def generate_reward(self, key): + if key in self.evicted_keys: + return 0 + return 1 + + +class LRUPolicy(Policy): + def prioritize_samples(self, samples, auxilliary_info): + return sorted( + samples, + cmp=lambda e1, e2: e1.value.last_access_number + - e2.value.last_access_number, + ) + + def policy_name(self): + return "lru" + + +class MRUPolicy(Policy): + def prioritize_samples(self, samples, auxilliary_info): + return sorted( + samples, + cmp=lambda e1, e2: e2.value.last_access_number + - e1.value.last_access_number, + ) + + def policy_name(self): + return "mru" + + +class LFUPolicy(Policy): + def prioritize_samples(self, samples, auxilliary_info): + return sorted(samples, cmp=lambda e1, e2: e1.value.num_hits - e2.value.num_hits) + + def policy_name(self): + return "lfu" + + +class HyperbolicPolicy(Policy): + """ + An implementation of Hyperbolic caching. + + Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017. + Hyperbolic caching: flexible caching for web applications. 
In Proceedings + of the 2017 USENIX Conference on Usenix Annual Technical Conference + (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511. + """ + + def compare(self, e1, e2, now): + e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float( + e1.value.value_size + ) + e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float( + e2.value.value_size + ) + if e1_duration == e2_duration: + return e1.value.num_hits - e2.value.num_hits + if e1_duration == 0: + return 1 + if e2_duration == 0: + return 1 + diff = (float(e1.value.num_hits) / (float(e1_duration))) - ( + float(e2.value.num_hits) / float(e2_duration) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + return sorted(samples, cmp=lambda e1, e2: self.compare(e1, e2, now)) + + def policy_name(self): + return "hb" + + +class CostClassPolicy(Policy): + """ + We calculate the hit density of a cost class as + number of hits / total size in cache * average duration in the cache. + + An entry has a higher priority if its class's hit density is higher. 
+ """ + + def compare(self, e1, e2, now, cost_classes, cost_class_label): + e1_class = e1.value.cost_class(cost_class_label) + e2_class = e2.value.cost_class(cost_class_label) + + assert e1_class in cost_classes + assert e2_class in cost_classes + + e1_entry = cost_classes[e1_class] + e2_entry = cost_classes[e2_class] + e1_density = e1_entry.density(now) + e2_density = e2_entry.density(now) + e1_hits = cost_classes[e1_class].hits + e2_hits = cost_classes[e2_class].hits + + if e1_density == e2_density: + return e1_hits - e2_hits + + if e1_entry.num_entries_in_cache == 0: + return -1 + if e2_entry.num_entries_in_cache == 0: + return 1 + + if e1_density == 0: + return 1 + if e2_density == 0: + return -1 + diff = (float(e1_hits) / float(e1_density)) - ( + float(e2_hits) / float(e2_density) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + cost_classes = auxilliary_info[1] + cost_class_label = auxilliary_info[2] + return sorted( + samples, + cmp=lambda e1, e2: self.compare( + e1, e2, now, cost_classes, cost_class_label + ), + ) + + def policy_name(self): + return "cc" + + +class Cache(object): + """ + This is the base class for the implementations of alternative cache + replacement policies. + """ + + def __init__(self, cache_size, enable_cache_row_key): + self.cache_size = cache_size + self.used_size = 0 + self.per_second_miss_ratio_stats = MissRatioStats(1) + self.miss_ratio_stats = MissRatioStats(kSecondsInMinute) + self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour) + # 0: disabled. 1: enabled. Insert both row and the refereneced data block. + # 2: enabled. Insert only the row but NOT the referenced data block. 
+ self.enable_cache_row_key = enable_cache_row_key + self.get_id_row_key_map = {} + self.max_seen_get_id = 0 + self.retain_get_id_range = 100000 + + def block_key(self, trace_record): + return "b{}".format(trace_record.block_id) + + def row_key(self, trace_record): + return "g{}-{}".format(trace_record.fd, trace_record.key_id) + + def _lookup(self, trace_record, key, hash): + """ + Look up the key in the cache. + Returns true upon a cache hit, false otherwise. + """ + raise NotImplementedError + + def _evict(self, trace_record, key, hash, value_size): + """ + Evict entries in the cache until there is enough room to insert the new + entry with 'value_size'. + """ + raise NotImplementedError + + def _insert(self, trace_record, key, hash, value_size): + """ + Insert the new entry into the cache. + """ + raise NotImplementedError + + def _should_admit(self, trace_record, key, hash, value_size): + """ + A custom admission policy to decide whether we should admit the new + entry upon a cache miss. + Returns true if the new entry should be admitted, false otherwise. + """ + raise NotImplementedError + + def cache_name(self): + """ + The name of the replacement policy. + """ + raise NotImplementedError + + def is_ml_cache(self): + return False + + def _update_stats(self, access_time, is_hit, miss_bytes): + self.per_second_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + + def access(self, trace_record): + """ + Access a trace record. The simulator calls this function to access a + trace record. + """ + assert self.used_size <= self.cache_size + if ( + self.enable_cache_row_key > 0 + and trace_record.caller == 1 + and trace_record.key_id != 0 + and trace_record.get_id != 0 + ): + # This is a get request. 
+ self._access_row(trace_record) + return + is_hit = self._access_kv( + trace_record, + self.block_key(trace_record), + trace_record.block_id, + trace_record.block_size, + trace_record.no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit=is_hit, miss_bytes=trace_record.block_size + ) + + def _access_row(self, trace_record): + row_key = self.row_key(trace_record) + self.max_seen_get_id = max(self.max_seen_get_id, trace_record.get_id) + self.get_id_row_key_map.pop( + self.max_seen_get_id - self.retain_get_id_range, None + ) + if trace_record.get_id not in self.get_id_row_key_map: + self.get_id_row_key_map[trace_record.get_id] = {} + self.get_id_row_key_map[trace_record.get_id]["h"] = False + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + # print("row hit 1") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) + return + if row_key not in self.get_id_row_key_map[trace_record.get_id]: + # First time seen this key. + is_hit = self._access_kv( + trace_record, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + inserted = False + if trace_record.kv_size > 0: + inserted = True + self.get_id_row_key_map[trace_record.get_id][row_key] = inserted + self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + # print("row hit 2") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) + return + # Access its blocks. 
+ no_insert = trace_record.no_insert + if ( + self.enable_cache_row_key == 2 + and trace_record.kv_size > 0 + and trace_record.block_type == 9 + ): + no_insert = True + is_hit = self._access_kv( + trace_record, + key=self.block_key(trace_record), + hash=trace_record.block_id, + value_size=trace_record.block_size, + no_insert=no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit, miss_bytes=trace_record.block_size + ) + if ( + trace_record.kv_size > 0 + and not self.get_id_row_key_map[trace_record.get_id][row_key] + ): + # Insert the row key-value pair. + self._access_kv( + trace_record, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + # Mark as inserted. + self.get_id_row_key_map[trace_record.get_id][row_key] = True + + def _access_kv(self, trace_record, key, hash, value_size, no_insert): + # Sanity checks. + assert self.used_size <= self.cache_size + if self._lookup(trace_record, key, hash): + # A cache hit. + return True + if no_insert or value_size <= 0: + return False + # A cache miss. + if value_size > self.cache_size: + # The block is too large to fit into the cache. + return False + self._evict(trace_record, key, hash, value_size) + if self._should_admit(trace_record, key, hash, value_size): + self._insert(trace_record, key, hash, value_size) + self.used_size += value_size + return False + + +class CostClassEntry: + """ + A cost class maintains aggregated statistics of cached entries in a class. + For example, we may define block type as a class. Then, cached blocks of the + same type will share one cost class entry. 
+ """ + + def __init__(self): + self.hits = 0 + self.num_entries_in_cache = 0 + self.size_in_cache = 0 + self.sum_insertion_times = 0 + self.sum_last_access_time = 0 + + def insert(self, trace_record, key, value_size): + self.size_in_cache += value_size + self.num_entries_in_cache += 1 + self.sum_insertion_times += trace_record.access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def remove(self, insertion_time, last_access_time, key, value_size, num_hits): + self.hits -= num_hits + self.num_entries_in_cache -= 1 + self.sum_insertion_times -= insertion_time / kMicrosInSecond + self.size_in_cache -= value_size + self.sum_last_access_time -= last_access_time / kMicrosInSecond + + def update_on_hit(self, trace_record, last_access_time): + self.hits += 1 + self.sum_last_access_time -= last_access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def avg_lifetime_in_cache(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + return now / kMicrosInSecond - avg_insertion_time + + def avg_last_access_time(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def avg_size(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def density(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + in_cache_duration = now / kMicrosInSecond - avg_insertion_time + return self.size_in_cache * in_cache_duration + + +class MLCache(Cache): + """ + MLCache is the base class for implementations of alternative replacement + policies using reinforcement learning. 
+ """ + + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(MLCache, self).__init__(cache_size, enable_cache_row_key) + self.table = HashTable() + self.policy_stats = PolicyStats(kSecondsInMinute, policies) + self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) + self.policies = policies + self.cost_classes = {} + self.cost_class_label = cost_class_label + + def is_ml_cache(self): + return True + + def _lookup(self, trace_record, key, hash): + value = self.table.lookup(key, hash) + if value is not None: + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].update_on_hit( + trace_record, value.last_access_time + ) + # Update the entry's last access time. + self.table.insert( + key, + hash, + CacheEntry( + value_size=value.value_size, + cf_id=value.cf_id, + level=value.level, + block_type=value.block_type, + table_id=value.table_id, + access_number=self.miss_ratio_stats.num_accesses, + time_s=trace_record.access_time, + num_hits=value.num_hits + 1, + ), + ) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Select a policy, random sample kSampleSize keys from the cache, then + # evict keys in the sample set until we have enough room for the new + # entry. + policy_index = self._select_policy(trace_record, key) + assert policy_index < len(self.policies) and policy_index >= 0 + self.policies[policy_index].delete(key) + self.policy_stats.update_metrics(trace_record.access_time, policy_index) + self.per_hour_policy_stats.update_metrics( + trace_record.access_time, policy_index + ) + while self.used_size + value_size > self.cache_size: + # Randomly sample n entries. 
+ samples = self.table.random_sample(kSampleSize) + samples = self.policies[policy_index].prioritize_samples( + samples, + [trace_record.access_time, self.cost_classes, self.cost_class_label], + ) + for hash_entry in samples: + assert self.table.delete(hash_entry.key, hash_entry.hash) is not None + self.used_size -= hash_entry.value.value_size + self.policies[policy_index].evict( + key=hash_entry.key, max_size=self.table.elements + ) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = hash_entry.value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].remove( + hash_entry.value.insertion_time, + hash_entry.value.last_access_time, + key, + hash_entry.value.value_size, + hash_entry.value.num_hits, + ) + if self.used_size + value_size <= self.cache_size: + break + + def _insert(self, trace_record, key, hash, value_size): + assert self.used_size + value_size <= self.cache_size + entry = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + self.miss_ratio_stats.num_accesses, + trace_record.access_time, + ) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = entry.cost_class(self.cost_class_label) + if cost_class not in self.cost_classes: + self.cost_classes[cost_class] = CostClassEntry() + self.cost_classes[cost_class].insert(trace_record, key, value_size) + self.table.insert(key, hash, entry) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def _select_policy(self, trace_record, key): + raise NotImplementedError + + +class ThompsonSamplingCache(MLCache): + """ + An implementation of Thompson Sampling for the Bernoulli Bandit. + + Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, + and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. + Trends Mach. Learn. 11, 1 (July 2018), 1-96. 
+ DOI: https://doi.org/10.1561/2200000070 + """ + + def __init__( + self, + cache_size, + enable_cache_row_key, + policies, + cost_class_label, + init_a=1, + init_b=1, + ): + super(ThompsonSamplingCache, self).__init__( + cache_size, enable_cache_row_key, policies, cost_class_label + ) + self._as = {} + self._bs = {} + for _i in range(len(policies)): + self._as = [init_a] * len(self.policies) + self._bs = [init_b] * len(self.policies) + + def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 + samples = [ + np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies)) + ] + selected_policy = max(range(len(self.policies)), key=lambda x: samples[x]) + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self._as[selected_policy] += reward + self._bs[selected_policy] += 1 - reward + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format( + self.cost_class_label + ) + return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label) + + +class LinUCBCache(MLCache): + """ + An implementation of LinUCB with disjoint linear models. + + Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. + A contextual-bandit approach to personalized news article recommendation. + In Proceedings of the 19th international conference on World wide web + (WWW '10). ACM, New York, NY, USA, 661-670. + DOI=http://dx.doi.org/10.1145/1772690.1772758 + """ + + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(LinUCBCache, self).__init__( + cache_size, enable_cache_row_key, policies, cost_class_label + ) + self.nfeatures = 4 # Block type, level, cf. 
+ self.th = np.zeros((len(self.policies), self.nfeatures)) + self.eps = 0.2 + self.b = np.zeros_like(self.th) + self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + for i in range(len(self.policies)): + self.A[i] = np.identity(self.nfeatures) + self.th_hat = np.zeros_like(self.th) + self.p = np.zeros(len(self.policies)) + self.alph = 0.2 + + def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 + x_i = np.zeros(self.nfeatures) # The current context vector + x_i[0] = trace_record.block_type + x_i[1] = trace_record.level + x_i[2] = trace_record.cf_id + p = np.zeros(len(self.policies)) + for a in range(len(self.policies)): + self.th_hat[a] = self.A_inv[a].dot(self.b[a]) + ta = x_i.dot(self.A_inv[a]).dot(x_i) + a_upper_ci = self.alph * np.sqrt(ta) + a_mean = self.th_hat[a].dot(x_i) + p[a] = a_mean + a_upper_ci + p = p + (np.random.random(len(p)) * 0.000001) + selected_policy = p.argmax() + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self.A[selected_policy] += np.outer(x_i, x_i) + self.b[selected_policy] += reward * x_i + self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy]) + del x_i + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format( + self.cost_class_label + ) + return "LinUCB with cost class {} (linucb)".format(self.cost_class_label) + + +class OPTCacheEntry: + """ + A cache entry for the OPT algorithm. The entries are sorted based on its + next access sequence number in reverse order, i.e., the entry which next + access is the furthest in the future is ordered before other entries. 
+ """ + + def __init__(self, key, next_access_seq_no, value_size): + self.key = key + self.next_access_seq_no = next_access_seq_no + self.value_size = value_size + self.is_removed = False + + def __cmp__(self, other): + if other.next_access_seq_no != self.next_access_seq_no: + return other.next_access_seq_no - self.next_access_seq_no + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class PQTable: + """ + A hash table with a priority queue. + """ + + def __init__(self): + # A list of entries arranged in a heap sorted based on the entry custom + # implementation of __cmp__ + self.pq = [] + self.table = {} + + def pqinsert(self, entry): + "Add a new key or update the priority of an existing key" + # Remove the entry from the table first. + removed_entry = self.table.pop(entry.key, None) + if removed_entry: + # Mark as removed since there is no 'remove' API in heappq. + # Instead, an entry in pq is removed lazily when calling pop. + removed_entry.is_removed = True + self.table[entry.key] = entry + heapq.heappush(self.pq, entry) + return removed_entry + + def pqpop(self): + while self.pq: + entry = heapq.heappop(self.pq) + if not entry.is_removed: + del self.table[entry.key] + return entry + return None + + def pqpeek(self): + while self.pq: + entry = self.pq[0] + if not entry.is_removed: + return entry + heapq.heappop(self.pq) + return + + def __contains__(self, k): + return k in self.table + + def __getitem__(self, k): + return self.table[k] + + def __len__(self): + return len(self.table) + + def values(self): + return self.table.values() + + +class OPTCache(Cache): + """ + An implementation of the Belady MIN algorithm. OPTCache evicts an entry + in the cache whose next access occurs furthest in the future. 
+ + Note that Belady MIN algorithm is optimal assuming all blocks having the + same size and a missing entry will be inserted in the cache. + These are NOT true for the block cache trace since blocks have different + sizes and we may not insert a block into the cache upon a cache miss. + However, it is still useful to serve as a "theoretical upper bound" on the + lowest miss ratio we can achieve given a cache size. + + L. A. Belady. 1966. A Study of Replacement Algorithms for a + Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101. + DOI=http://dx.doi.org/10.1147/sj.52.0078 + """ + + def __init__(self, cache_size): + super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0) + self.table = PQTable() + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its next access time. + assert ( + self.table.pqinsert( + OPTCacheEntry( + key, trace_record.next_access_seq_no, self.table[key].value_size + ) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert( + OPTCacheEntry(key, trace_record.next_access_seq_no, value_size) + ) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + return "Belady MIN (opt)" + + +class GDSizeEntry: + """ + A cache entry for the greedy dual size replacement policy. 
+ """ + + def __init__(self, key, value_size, priority): + self.key = key + self.value_size = value_size + self.priority = priority + self.is_removed = False + + def __cmp__(self, other): + if other.priority != self.priority: + return self.priority - other.priority + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class GDSizeCache(Cache): + """ + An implementation of the greedy dual size algorithm. + We define cost as an entry's size. + + See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html + and N. Young. The k-server dual and loose competitiveness for paging. + Algorithmica,June 1994, vol. 11,(no.6):525-41. + Rewritten version of ''On-line caching as cache size varies'', + in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key) + self.table = PQTable() + self.L = 0.0 + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid GreedyDualSize (gdsize_hybrid)" + return "GreedyDualSize (gdsize)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its priority. 
+ entry = self.table[key] + assert ( + self.table.pqinsert( + GDSizeEntry(key, entry.value_size, self.L + entry.value_size) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.L = evict_entry.priority + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert(GDSizeEntry(key, value_size, self.L + value_size)) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class Deque(object): + """A Deque class facilitates the implementation of LRU and ARC.""" + + def __init__(self): + self.od = OrderedDict() + + def appendleft(self, k): + if k in self.od: + del self.od[k] + self.od[k] = None + + def pop(self): + item = self.od.popitem(last=False) if self.od else None + if item is not None: + return item[0] + return None + + def remove(self, k): + del self.od[k] + + def __len__(self): + return len(self.od) + + def __contains__(self, k): + return k in self.od + + def __iter__(self): + return reversed(self.od) + + def __repr__(self): + return "Deque(%r)" % (list(self),) + + +class ARCCache(Cache): + """ + An implementation of ARC. ARC assumes that all blocks are having the + same size. The size of index and filter blocks are variable. To accommodate + this, we modified ARC as follows: + 1) We use 16 KB as the average block size and calculate the number of blocks + (c) in the cache. + 2) When we insert an entry, the cache evicts entries in both t1 and t2 + queues until it has enough space for the new entry. This also requires + modification of the algorithm to maintain a maximum of 2*c blocks. + + Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low + Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on + File and Storage Technologies (FAST '03). 
USENIX Association, Berkeley, CA, + USA, 115-130. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(ARCCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.c = cache_size / 16 * 1024 # Number of elements in the cache. + self.p = 0 # Target size for the list T1 + # L1: only once recently + self.t1 = Deque() # T1: recent cache entries + self.b1 = Deque() # B1: ghost entries recently evicted from the T1 cache + # L2: at least twice recently + self.t2 = Deque() # T2: frequent entries + self.b2 = Deque() # B2: ghost entries recently evicted from the T2 cache + + def _replace(self, key, value_size): + while self.used_size + value_size > self.cache_size: + if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)): + old = self.t1.pop() + self.b1.appendleft(old) + else: + if self.t2: + old = self.t2.pop() + self.b2.appendleft(old) + else: + old = self.t1.pop() + self.b1.appendleft(old) + self.used_size -= self.table[old].value_size + del self.table[old] + + def _lookup(self, trace_record, key, hash): + # Case I: key is in T1 or T2. + # Move key to MRU position in T2. + if key in self.t1: + self.t1.remove(key) + self.t2.appendleft(key) + return True + + if key in self.t2: + self.t2.remove(key) + self.t2.appendleft(key) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Case II: key is in B1 + # Move x from B1 to the MRU position in T2 (also fetch x to the cache). + if key in self.b1: + self.p = min(self.c, self.p + max(len(self.b2) / len(self.b1), 1)) + self._replace(key, value_size) + self.b1.remove(key) + self.t2.appendleft(key) + return + + # Case III: key is in B2 + # Move x from B2 to the MRU position in T2 (also fetch x to the cache). 
+ if key in self.b2: + self.p = max(0, self.p - max(len(self.b1) / len(self.b2), 1)) + self._replace(key, value_size) + self.b2.remove(key) + self.t2.appendleft(key) + return + + # Case IV: key is not in (T1 u B1 u T2 u B2) + self._replace(key, value_size) + while len(self.t1) + len(self.b1) >= self.c and self.b1: + self.b1.pop() + + total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2) + while total >= (2 * self.c) and self.b2: + self.b2.pop() + total -= 1 + # Finally, move it to MRU position in T1. + self.t1.appendleft(key) + return + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid Adaptive Replacement Cache (arc_hybrid)" + return "Adaptive Replacement Cache (arc)" + + +class LRUCache(Cache): + """ + A strict LRU queue. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(LRUCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.lru = Deque() + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LRU (lru_hybrid)" + return "LRU (lru)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update LRU queue. 
+ self.lru.remove(key) + self.lru.appendleft(key) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_key = self.lru.pop() + self.used_size -= self.table[evict_key].value_size + del self.table[evict_key] + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + self.lru.appendleft(key) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class TraceCache(Cache): + """ + A trace cache. Lookup returns true if the trace observes a cache hit. + It is used to maintain cache hits observed in the trace. + """ + + def __init__(self, cache_size): + super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0) + + def _lookup(self, trace_record, key, hash): + return trace_record.is_hit + + def _evict(self, trace_record, key, hash, value_size): + pass + + def _insert(self, trace_record, key, hash, value_size): + pass + + def _should_admit(self, trace_record, key, hash, value_size): + return False + + def cache_name(self): + return "Trace" + + +def parse_cache_size(cs): + cs = cs.replace("\n", "") + if cs[-1] == "M": + return int(cs[: len(cs) - 1]) * 1024 * 1024 + if cs[-1] == "G": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 + if cs[-1] == "T": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 * 1024 + return int(cs) + + +def create_cache(cache_type, cache_size, downsample_size): + cache_size = cache_size / downsample_size + enable_cache_row_key = 0 + if "hybridn" in cache_type: + enable_cache_row_key = 2 + cache_type = cache_type[:-8] + if "hybrid" in cache_type: + enable_cache_row_key = 1 + cache_type = cache_type[:-7] + if cache_type == "ts": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + 
cost_class_label=None, + ) + elif cache_type == "linucb": + return LinUCBCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pylru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LRUPolicy()], cost_class_label=None + ) + elif cache_type == "pymru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [MRUPolicy()], cost_class_label=None + ) + elif cache_type == "pylfu": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LFUPolicy()], cost_class_label=None + ) + elif cache_type == "pyhb": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pycctbbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_bt", + ) + elif cache_type == "pycccf": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="cf" + ) + elif cache_type == "pycctblevelbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_level_bt", + ) + elif cache_type == "pycccfbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="cf_bt", + ) + elif cache_type == "pycctb": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table", + ) + elif cache_type == "pyccbt": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="bt" + ) + elif cache_type == "opt": + if enable_cache_row_key: + print("opt does not support hybrid mode.") + assert False + return OPTCache(cache_size) + elif cache_type == "trace": + if enable_cache_row_key: + print("trace does not support hybrid mode.") + assert False + return 
TraceCache(cache_size) + elif cache_type == "lru": + return LRUCache(cache_size, enable_cache_row_key) + elif cache_type == "arc": + return ARCCache(cache_size, enable_cache_row_key) + elif cache_type == "gdsize": + return GDSizeCache(cache_size, enable_cache_row_key) + else: + print("Unknown cache type {}".format(cache_type)) + assert False + return None + + +class BlockAccessTimeline: + """ + BlockAccessTimeline stores all accesses of a block. + """ + + def __init__(self): + self.accesses = [] + self.current_access_index = 1 + + def get_next_access(self): + if self.current_access_index == len(self.accesses): + return sys.maxsize + next_access_seq_no = self.accesses[self.current_access_index] + self.current_access_index += 1 + return next_access_seq_no + + +def percent(e1, e2): + if e2 == 0: + return -1 + return float(e1) * 100.0 / float(e2) + + +def is_target_cf(access_cf, target_cf_name): + if target_cf_name == "all": + return True + return access_cf == target_cf_name + + +def run( + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, +): + warmup_complete = False + trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + access_seq_no = 0 + time_interval = 1 + start_time = time.time() + trace_start_time = 0 + trace_duration = 0 + is_opt_cache = False + if cache.cache_name() == "Belady MIN (opt)": + is_opt_cache = True + + block_access_timelines = {} + num_no_inserts = 0 + num_blocks_with_no_size = 0 + num_inserts_block_with_no_size = 0 + + if is_opt_cache: + # Read all blocks in memory and stores their access times so that OPT + # can use this information to evict the cached key which next access is + # the furthest in the future. 
+ print("Preprocessing block traces.") + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + ts = line.split(",") + timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + block_id = int(ts[1]) + block_size = int(ts[3]) + no_insert = int(ts[9]) + if block_id not in block_access_timelines: + block_access_timelines[block_id] = BlockAccessTimeline() + if block_size == 0: + num_blocks_with_no_size += 1 + block_access_timelines[block_id].accesses.append(access_seq_no) + access_seq_no += 1 + if no_insert == 1: + num_no_inserts += 1 + if no_insert == 0 and block_size == 0: + num_inserts_block_with_no_size += 1 + if access_seq_no % 100 != 0: + continue + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. Throughput: {} records/second.".format( + now - start_time, + access_seq_no, + trace_duration / 1000000, + access_seq_no / (now - start_time), + ) + ) + time_interval += 1 + print( + "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size." 
+ "{3} accesses, {4}({5:.2f}%) accesses with no_insert," + "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format( + len(block_access_timelines), + num_blocks_with_no_size, + percent(num_blocks_with_no_size, len(block_access_timelines)), + access_seq_no, + num_no_inserts, + percent(num_no_inserts, access_seq_no), + num_inserts_block_with_no_size, + percent(num_inserts_block_with_no_size, access_seq_no), + ) + ) + + access_seq_no = 0 + time_interval = 1 + start_time = time.time() + trace_start_time = 0 + trace_duration = 0 + print("Running simulated {} cache on block traces.".format(cache.cache_name())) + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + if access_seq_no % 1000000 == 0: + # Force a python gc periodically to reduce memory usage. + gc.collect() + ts = line.split(",") + timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + if ( + not warmup_complete + and warmup_seconds > 0 + and trace_duration > warmup_seconds * 1000000 + ): + cache.miss_ratio_stats.reset_counter() + warmup_complete = True + next_access_seq_no = 0 + block_id = int(ts[1]) + if is_opt_cache: + next_access_seq_no = block_access_timelines[block_id].get_next_access() + record = TraceRecord( + access_time=int(ts[0]), + block_id=int(ts[1]), + block_type=int(ts[2]), + block_size=int(ts[3]), + cf_id=int(ts[4]), + cf_name=ts[5], + level=int(ts[6]), + fd=int(ts[7]), + caller=int(ts[8]), + no_insert=int(ts[9]), + get_id=int(ts[10]), + key_id=int(ts[11]), + kv_size=int(ts[12]), + is_hit=int(ts[13]), + referenced_key_exist_in_block=int(ts[14]), + num_keys_in_block=int(ts[15]), + table_id=int(ts[16]), + seq_number=int(ts[17]), + block_key_size=int(ts[18]), + key_size=int(ts[19]), + 
block_offset_in_file=int(ts[20]), + next_access_seq_no=next_access_seq_no, + ) + trace_miss_ratio_stats.update_metrics( + record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size + ) + cache.access(record) + access_seq_no += 1 + del record + del ts + if access_seq_no % 100 != 0: + continue + # Report progress every 10 seconds. + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. Throughput: {} records/second. " + "Trace miss ratio {}".format( + now - start_time, + access_seq_no, + trace_duration / 1000000, + access_seq_no / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + time_interval += 1 + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + now = time.time() + print( + "Take {} seconds to process {} trace records with trace duration of {} " + "seconds. Throughput: {} records/second. 
Trace miss ratio {}".format( + now - start_time, + access_seq_no, + trace_duration / 1000000, + access_seq_no / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + return trace_start_time, trace_duration + + +def report_stats( + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, +): + cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name) + with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: + mrc_file.write( + "{},0,0,{},{},{}\n".format( + cache_type, + cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + + cache_stats = [ + cache.per_second_miss_ratio_stats, + cache.miss_ratio_stats, + cache.per_hour_miss_ratio_stats, + ] + for i in range(len(cache_stats)): + avg_miss_bytes, p95_miss_bytes = cache_stats[i].compute_miss_bytes() + + with open( + "{}/data-ml-avgmb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes) + ) + + with open( + "{}/data-ml-p95mb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes) + ) + + cache_stats[i].write_miss_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + cache_stats[i].write_miss_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + + if not cache.is_ml_cache(): + return + + policy_stats = [cache.policy_stats, cache.per_hour_policy_stats] + for i in range(len(policy_stats)): + policy_stats[i].write_policy_timeline( + cache_type, + 
cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + policy_stats[i].write_policy_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + + +if __name__ == "__main__": + if len(sys.argv) <= 8: + print( + "Must provide 8 arguments.\n" + "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, " + "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache " + "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. " + "Note that hybrid is not supported with opt and trace. \n" + "2) Cache size (xM, xG, xT).\n" + "3) The sampling frequency used to collect the trace. (The " + "simulation scales down the cache size by the sampling frequency).\n" + "4) Warmup seconds (The number of seconds used for warmup).\n" + "5) Trace file path.\n" + "6) Result directory (A directory that saves generated results)\n" + "7) Max number of accesses to process\n" + "8) The target column family. (The simulation will only run " + "accesses on the target column family. 
If it is set to all, " + "it will run against all accesses.)" + ) + exit(1) + print("Arguments: {}".format(sys.argv)) + cache_type = sys.argv[1] + cache_size = parse_cache_size(sys.argv[2]) + downsample_size = int(sys.argv[3]) + warmup_seconds = int(sys.argv[4]) + trace_file_path = sys.argv[5] + result_dir = sys.argv[6] + max_accesses_to_process = int(sys.argv[7]) + target_cf_name = sys.argv[8] + cache = create_cache(cache_type, cache_size, downsample_size) + trace_start_time, trace_duration = run( + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, + ) + trace_end_time = trace_start_time + trace_duration + report_stats( + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh new file mode 100644 index 000000000..295f734aa --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to run a batch of pysims and combine individual pysim output files. +# +# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs +# trace_file_path: The file path that stores the traces. +# result_dir: The directory to store pysim results. The output files from a pysim is stores in result_dir/ml +# downsample_size: The downsample size used to collect the trace. +# warmup_seconds: The number of seconds used for warmup. +# max_jobs: The max number of concurrent pysims to run. + +# Install required packages to run simulations. 
+# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel +ulimit -c 0 + +if [ $# -ne 5 ]; then + echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" + exit 0 +fi + +trace_file="$1" +result_dir="$2" +downsample_size="$3" +warmup_seconds="$4" +max_jobs="$5" +max_num_accesses=100000000 +current_jobs=1 + +ml_tmp_result_dir="$result_dir/ml" +rm -rf "$ml_tmp_result_dir" +mkdir -p "$result_dir" +mkdir -p "$ml_tmp_result_dir" + +# Report miss ratio in the trace. +current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) +for cf_name in "all" +do +for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T" +do +for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid" #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace" +do + if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then + # We only need to collect miss ratios observed in the trace once. + continue + fi + while [ "$current_jobs" -ge "$max_jobs" ] + do + sleep 10 + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + done + output="log-ml-$cache_type-$cache_size-$cf_name" + echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs. " + nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" & + current_jobs=$((current_jobs+1)) +done +done +done + +# Wait for all jobs to complete. +while [ $current_jobs -gt 0 ] +do + sleep 10 + echo "Waiting jobs to complete. 
Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" +done + +echo "Combine individual pysim output files" + +rm -rf "$result_dir/ml_*" +for header in "header-" "data-" +do +for fn in "$ml_tmp_result_dir"/* +do + sum_file="" + time_unit="" + capacity="" + target_cf_name="" + if [[ $fn == *"timeline"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit_index=0 + capacity_index=0 + for i in "${elements[@]}" + do + if [[ $i == "timeline" ]]; then + break + fi + time_unit_index=$((time_unit_index+1)) + done + time_unit_index=$((time_unit_index+1)) + capacity_index=$((time_unit_index+2)) + target_cf_name_index=$((time_unit_index+3)) + time_unit="${elements[$time_unit_index]}_" + capacity="${elements[$capacity_index]}_" + target_cf_name="${elements[$target_cf_name_index]}_" + fi + + if [[ $fn == *"${header}ml-policy-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline" + fi + if [[ $fn == *"${header}ml-policy-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline" + fi + if [[ $fn == *"${header}ml-miss-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline" + fi + if [[ $fn == *"${header}ml-miss-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline" + fi + if [[ $fn == *"${header}ml-mrc"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${target_cf_name}_mrc" + fi + if [[ $fn == *"${header}ml-avgmb"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb" + fi + if [[ $fn == *"${header}ml-p95mb"* ]]; 
then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb" + fi + if [[ $sum_file == "" ]]; then + continue + fi + if [[ $header == "header-" ]]; then + if [ -e "$sum_file" ]; then + continue + fi + fi + cat "$fn" >> "$sum_file" +done +done + +echo "Done" +for fn in $result_dir/* +do + if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then + # Sort MRC file by cache_type and cache_size. + tmp_file="$result_dir/tmp_mrc" + cat "$fn" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" + cat "$tmp_file" > "$fn" + rm -rf "$tmp_file" + fi +done diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py new file mode 100644 index 000000000..eed1b94af --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -0,0 +1,734 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +import os +import random +import sys + +from block_cache_pysim import ( + ARCCache, + CacheEntry, + create_cache, + GDSizeCache, + HashTable, + HyperbolicPolicy, + kMicrosInSecond, + kSampleSize, + LFUPolicy, + LinUCBCache, + LRUCache, + LRUPolicy, + MRUPolicy, + OPTCache, + OPTCacheEntry, + run, + ThompsonSamplingCache, + TraceCache, + TraceRecord, +) + + +def test_hash_table(): + print("Test hash table") + table = HashTable() + data_size = 10000 + for i in range(data_size): + table.insert("k{}".format(i), i, "v{}".format(i)) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is not None + for i in range(data_size): + table.delete("k{}".format(i), i) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is None + + truth_map = {} + n = 1000000 + records = 100 + for i in range(n): + key_id = random.randint(0, records) + v = random.randint(0, records) + key = "k{}".format(key_id) + value = CacheEntry(v, v, v, v, v, v, v) + action = random.randint(0, 10) + assert len(truth_map) == table.elements, "{} {} {}".format( + len(truth_map), table.elements, i + ) + if action <= 8: + if key in truth_map: + assert table.lookup(key, key_id) is not None + assert truth_map[key].value_size == table.lookup(key, key_id).value_size + else: + assert table.lookup(key, key_id) is None + table.insert(key, key_id, value) + truth_map[key] = value + else: + deleted = table.delete(key, key_id) + if deleted: + assert key in truth_map + if key in truth_map: + del truth_map[key] + + # Check all keys are unique in the sample set. 
+ for _i in range(10): + samples = table.random_sample(kSampleSize) + unique_keys = {} + for sample in samples: + unique_keys[sample.key] = True + assert len(samples) == len(unique_keys) + + assert len(table) == len(truth_map) + for key in truth_map: + assert table.lookup(key, int(key[1:])) is not None + assert truth_map[key].value_size == table.lookup(key, int(key[1:])).value_size + print("Test hash table: Success") + + +def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtable=True): + assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format( + expected_value[0], cache.used_size + ) + assert ( + cache.miss_ratio_stats.num_accesses == expected_value[1] + ), "Expected {}, Actual {}".format( + expected_value[1], cache.miss_ratio_stats.num_accesses + ) + assert ( + cache.miss_ratio_stats.num_misses == expected_value[2] + ), "Expected {}, Actual {}".format( + expected_value[2], cache.miss_ratio_stats.num_misses + ) + assert len(cache.table) == len(expected_value[3]) + len( + expected_value[4] + ), "Expected {}, Actual {}".format( + len(expected_value[3]) + len(expected_value[4]), cache.table.elements + ) + for expeceted_k in expected_value[3]: + if custom_hashtable: + val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k) + else: + val = cache.table["b{}".format(expeceted_k)] + assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format( + expeceted_k, expected_value, cache.table + ) + assert val.value_size == expected_value_size + for expeceted_k in expected_value[4]: + if custom_hashtable: + val = cache.table.lookup("g0-{}".format(expeceted_k), expeceted_k) + else: + val = cache.table["g0-{}".format(expeceted_k)] + assert val is not None + assert val.value_size == expected_value_size + + +# Access k1, k1, k2, k3, k3, k3, k4 +# When k4 is inserted, +# LRU should evict k1. +# LFU should evict k2. +# MRU should evict k3. 
+def test_cache(cache, expected_value, custom_hashtable=True): + k1 = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, + ) + k2 = TraceRecord( + access_time=1, + block_id=2, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, + ) + k3 = TraceRecord( + access_time=2, + block_id=3, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, + ) + k4 = TraceRecord( + access_time=3, + block_id=4, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, + ) + sequence = [k1, k1, k2, k3, k3, k3] + index = 0 + expected_values = [] + # Access k1, miss. + expected_values.append([1, 1, 1, [1], []]) + # Access k1, hit. + expected_values.append([1, 2, 1, [1], []]) + # Access k2, miss. + expected_values.append([2, 3, 2, [1, 2], []]) + # Access k3, miss. + expected_values.append([3, 4, 3, [1, 2, 3], []]) + # Access k3, hit. 
+ expected_values.append([3, 5, 3, [1, 2, 3], []]) + # Access k3, hit. + expected_values.append([3, 6, 3, [1, 2, 3], []]) + access_time = 0 + for access in sequence: + access.access_time = access_time + cache.access(access) + assert_metrics( + cache, + expected_values[index], + expected_value_size=1, + custom_hashtable=custom_hashtable, + ) + access_time += 1 + index += 1 + k4.access_time = access_time + cache.access(k4) + assert_metrics( + cache, expected_value, expected_value_size=1, custom_hashtable=custom_hashtable + ) + + +def test_lru_cache(cache, custom_hashtable): + print("Test LRU cache") + # Access k4, miss. evict k1 + test_cache(cache, [3, 7, 4, [2, 3, 4], []], custom_hashtable) + print("Test LRU cache: Success") + + +def test_mru_cache(): + print("Test MRU cache") + policies = [] + policies.append(MRUPolicy()) + # Access k4, miss. evict k3 + test_cache( + ThompsonSamplingCache(3, False, policies, cost_class_label=None), + [3, 7, 4, [1, 2, 4], []], + ) + print("Test MRU cache: Success") + + +def test_lfu_cache(): + print("Test LFU cache") + policies = [] + policies.append(LFUPolicy()) + # Access k4, miss. 
evict k2 + test_cache( + ThompsonSamplingCache(3, False, policies, cost_class_label=None), + [3, 7, 4, [1, 3, 4], []], + ) + print("Test LFU cache: Success") + + +def test_mix(cache): + print("Test Mix {} cache".format(cache.cache_name())) + n = 100000 + records = 100 + block_size_table = {} + trace_num_misses = 0 + for i in range(n): + key_id = random.randint(0, records) + vs = random.randint(0, 10) + now = i * kMicrosInSecond + block_size = vs + if key_id in block_size_table: + block_size = block_size_table[key_id] + else: + block_size_table[key_id] = block_size + is_hit = key_id % 2 + if is_hit == 0: + trace_num_misses += 1 + k = TraceRecord( + access_time=now, + block_id=key_id, + block_type=1, + block_size=block_size, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=key_id, + key_id=key_id, + kv_size=5, + is_hit=is_hit, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=vs, + ) + cache.access(k) + assert cache.miss_ratio_stats.miss_ratio() > 0 + if cache.cache_name() == "Trace": + assert cache.miss_ratio_stats.num_accesses == n + assert cache.miss_ratio_stats.num_misses == trace_num_misses + else: + assert cache.used_size <= cache.cache_size + all_values = cache.table.values() + cached_size = 0 + for value in all_values: + cached_size += value.value_size + assert cached_size == cache.used_size, "Expeced {} Actual {}".format( + cache.used_size, cached_size + ) + print("Test Mix {} cache: Success".format(cache.cache_name())) + + +def test_end_to_end(): + print("Test All caches") + n = 100000 + nblocks = 1000 + block_size = 16 * 1024 + ncfs = 7 + nlevels = 6 + nfds = 100000 + trace_file_path = "test_trace" + # All blocks are of the same size so that OPT must achieve the lowest miss + # ratio. 
+ with open(trace_file_path, "w+") as trace_file: + access_records = "" + for i in range(n): + key_id = random.randint(0, nblocks) + cf_id = random.randint(0, ncfs) + level = random.randint(0, nlevels) + fd = random.randint(0, nfds) + now = i * kMicrosInSecond + access_record = "" + access_record += "{},".format(now) + access_record += "{},".format(key_id) + access_record += "{},".format(9) # block type + access_record += "{},".format(block_size) # block size + access_record += "{},".format(cf_id) + access_record += "cf_{},".format(cf_id) + access_record += "{},".format(level) + access_record += "{},".format(fd) + access_record += "{},".format(key_id % 3) # caller + access_record += "{},".format(0) # no insert + access_record += "{},".format(i) # get_id + access_record += "{},".format(i) # key_id + access_record += "{},".format(100) # kv_size + access_record += "{},".format(1) # is_hit + access_record += "{},".format(1) # referenced_key_exist_in_block + access_record += "{},".format(10) # num_keys_in_block + access_record += "{},".format(1) # table_id + access_record += "{},".format(0) # seq_number + access_record += "{},".format(10) # block key size + access_record += "{},".format(20) # key size + access_record += "{},".format(0) # block offset + access_record = access_record[:-1] + access_records += access_record + "\n" + trace_file.write(access_records) + + print("Test All caches: Start testing caches") + cache_size = block_size * nblocks / 10 + downsample_size = 1 + cache_ms = {} + for cache_type in [ + "ts", + "opt", + "lru", + "pylru", + "linucb", + "gdsize", + "pyccbt", + "pycctbbt", + ]: + cache = create_cache(cache_type, cache_size, downsample_size) + run(trace_file_path, cache_type, cache, 0, -1, "all") + cache_ms[cache_type] = cache + assert cache.miss_ratio_stats.num_accesses == n + + for cache_type in cache_ms: + cache = cache_ms[cache_type] + ms = cache.miss_ratio_stats.miss_ratio() + assert ms <= 100.0 and ms >= 0.0 + # OPT should perform the best. 
+ assert cache_ms["opt"].miss_ratio_stats.miss_ratio() <= ms + assert cache.used_size <= cache.cache_size + all_values = cache.table.values() + cached_size = 0 + for value in all_values: + cached_size += value.value_size + assert cached_size == cache.used_size, "Expeced {} Actual {}".format( + cache.used_size, cached_size + ) + print("Test All {}: Success".format(cache.cache_name())) + + os.remove(trace_file_path) + print("Test All: Success") + + +def test_hybrid(cache): + print("Test {} cache".format(cache.cache_name())) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, + ) + cache.access(k) # Expect a miss. + # used size, num accesses, num misses, hash table size, blocks, get keys. + assert_metrics(cache, [1, 1, 1, [1], []]) + k.access_time += 1 + k.kv_size = 1 + k.block_id = 2 + cache.access(k) # k should be inserted. + assert_metrics(cache, [3, 2, 2, [1, 2], [1]]) + k.access_time += 1 + k.block_id = 3 + cache.access(k) # k should not be inserted again. + assert_metrics(cache, [4, 3, 3, [1, 2, 3], [1]]) + # A second get request referencing the same key. + k.access_time += 1 + k.get_id = 2 + k.block_id = 4 + k.kv_size = 0 + cache.access(k) # k should observe a hit. No block access. + assert_metrics(cache, [4, 4, 3, [1, 2, 3], [1]]) + + # A third get request searches three files, three different keys. + # And the second key observes a hit. + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 3 + k.key_id = 2 + cache.access(k) # k should observe a miss. block 3 observes a hit. 
+ assert_metrics(cache, [5, 5, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 1 + cache.access(k) # k1 should observe a hit. + assert_metrics(cache, [5, 6, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 3 + # k3 should observe a miss. + # However, as the get already complete, we should not access k3 any more. + cache.access(k) + assert_metrics(cache, [5, 7, 3, [1, 2, 3], [1, 2]]) + + # A fourth get request searches one file and two blocks. One row key. + k.access_time += 1 + k.get_id = 4 + k.block_id = 5 + k.key_id = 4 + k.kv_size = 1 + cache.access(k) + assert_metrics(cache, [7, 8, 4, [1, 2, 3, 5], [1, 2, 4]]) + + # A bunch of insertions which evict cached row keys. + for i in range(6, 100): + k.access_time += 1 + k.get_id = 0 + k.block_id = i + cache.access(k) + + k.get_id = 4 + k.block_id = 100 # A different block. + k.key_id = 4 # Same row key and should not be inserted again. + k.kv_size = 1 + cache.access(k) + assert_metrics( + cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []] + ) + print("Test {} cache: Success".format(cache.cache_name())) + + +def test_opt_cache(): + print("Test OPT cache") + cache = OPTCache(3) + # seq: 0, 1, 2, 3, 4, 5, 6, 7, 8 + # key: k1, k2, k3, k4, k5, k6, k7, k1, k8 + # next_access: 7, 19, 18, M, M, 17, 16, 25, M + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. 
+ is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert_metrics( + cache, [1, 1, 1, [1], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 2 + k.next_access_seq_no = 19 + cache.access(k) + assert_metrics( + cache, [2, 2, 2, [1, 2], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 3 + k.next_access_seq_no = 18 + cache.access(k) + assert_metrics( + cache, [3, 3, 3, [1, 2, 3], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 4 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 2 since its next access 19 is the furthest in the future. + assert_metrics( + cache, [3, 4, 4, [1, 3, 4], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 5 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 4 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 5, 5, [1, 3, 5], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 6 + k.next_access_seq_no = 17 + cache.access(k) + # Evict 5 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 6, 6, [1, 3, 6], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 7 + k.next_access_seq_no = 16 + cache.access(k) + # Evict 3 since its next access 18 is the furthest in the future. 
+ assert_metrics( + cache, [3, 7, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 1 + k.next_access_seq_no = 25 + cache.access(k) + assert_metrics( + cache, [3, 8, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 8 + k.next_access_seq_no = sys.maxsize + cache.access(k) + # Evict 1 since its next access 25 is the furthest in the future. + assert_metrics( + cache, [3, 9, 8, [6, 7, 8], []], expected_value_size=1, custom_hashtable=False + ) + + # Insert a large kv pair to evict all keys. + k.access_time += 1 + k.block_id = 10 + k.block_size = 3 + k.next_access_seq_no = sys.maxsize + cache.access(k) + assert_metrics( + cache, [3, 10, 9, [10], []], expected_value_size=3, custom_hashtable=False + ) + print("Test OPT cache: Success") + + +def test_trace_cache(): + print("Test trace cache") + cache = TraceCache(0) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=0, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 1 + assert cache.miss_ratio_stats.num_misses == 0 + k.is_hit = 0 + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 2 + assert cache.miss_ratio_stats.num_misses == 1 + print("Test trace cache: Success") + + +if __name__ == "__main__": + test_hash_table() + test_trace_cache() + test_opt_cache() + test_lru_cache( + ThompsonSamplingCache( + 3, enable_cache_row_key=0, policies=[LRUPolicy()], cost_class_label=None + ), + custom_hashtable=True, + ) + test_lru_cache(LRUCache(3, enable_cache_row_key=0), custom_hashtable=False) + test_mru_cache() + test_lfu_cache() + test_hybrid( + ThompsonSamplingCache( + 
kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + test_hybrid( + LinUCBCache( + kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + for cache_type in [ + "ts", + "opt", + "arc", + "pylfu", + "pymru", + "trace", + "pyhb", + "lru", + "pylru", + "linucb", + "gdsize", + "pycctbbt", + "pycctb", + "pyccbt", + ]: + for enable_row_cache in [0, 1, 2]: + cache_type_str = cache_type + if cache_type != "opt" and cache_type != "trace": + if enable_row_cache == 1: + cache_type_str += "_hybrid" + elif enable_row_cache == 2: + cache_type_str += "_hybridn" + test_mix(create_cache(cache_type_str, cache_size=100, downsample_size=1)) + test_end_to_end() diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc new file mode 100644 index 000000000..f0bb6975b --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -0,0 +1,2316 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE +#ifdef GFLAGS +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" + +#include <algorithm> +#include <cinttypes> +#include <cstdio> +#include <cstdlib> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <memory> +#include <random> +#include <sstream> + +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_record.h" +#include "util/gflags_compat.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(block_cache_trace_path, "", "The trace file path."); +DEFINE_bool(is_block_cache_human_readable_trace, false, + "Is the trace file provided for analysis generated by running " + "block_cache_trace_analyzer with " + "FLAGS_human_readable_trace_file_path is specified."); +DEFINE_string( + block_cache_sim_config_path, "", + "The config file path. One cache configuration per line. The format of a " + "cache configuration is " + "cache_name,num_shard_bits,ghost_capacity,cache_capacity_1,...,cache_" + "capacity_N. Supported cache names are lru, lru_priority, lru_hybrid, and " + "lru_hybrid_no_insert_on_row_miss. User may also add a prefix 'ghost_' to " + "a cache_name to add a ghost cache in front of the real cache. " + "ghost_capacity and cache_capacity can be xK, xM or xG where x is a " + "positive number."); +DEFINE_int32(block_cache_trace_downsample_ratio, 1, + "The trace collected accesses on one in every " + "block_cache_trace_downsample_ratio blocks. 
We scale " + "down the simulated cache size by this ratio."); +DEFINE_bool(print_block_size_stats, false, + "Print block size distribution and the distribution break down by " + "block type and column family."); +DEFINE_bool(print_access_count_stats, false, + "Print access count distribution and the distribution break down " + "by block type and column family."); +DEFINE_bool(print_data_block_access_count_stats, false, + "Print data block accesses by user Get and Multi-Get."); +DEFINE_int32(cache_sim_warmup_seconds, 0, + "The number of seconds to warmup simulated caches. The hit/miss " + "counters are reset after the warmup completes."); +DEFINE_int32(analyze_bottom_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the bottom k among all blocks."); +DEFINE_int32(analyze_top_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the top k among all blocks."); +DEFINE_string(block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results."); +DEFINE_string( + timeline_labels, "", + "Group the number of accesses per block per second using these labels. " + "Possible labels are a combination of the following: cf (column family), " + "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" " + "means the number of access per second is grouped by unique pairs of " + "\"cf_bt\". A label \"all\" contains the aggregated number of accesses per " + "second across all possible labels."); +DEFINE_string(reuse_distance_labels, "", + "Group the reuse distance of a block using these labels. Reuse " + "distance is defined as the cumulated size of unique blocks read " + "between two consecutive accesses on the same block."); +DEFINE_string( + reuse_distance_buckets, "", + "Group blocks by their reuse distances given these buckets. 
For " + "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M " + "and 1G, respectively. The last bucket contains the number of blocks with " + "reuse distance larger than 1G. "); +DEFINE_string( + reuse_interval_labels, "", + "Group the reuse interval of a block using these labels. Reuse " + "interval is defined as the time between two consecutive accesses " + "on the same block."); +DEFINE_string( + reuse_interval_buckets, "", + "Group blocks by their reuse interval given these buckets. For " + "example, if 'reuse_distance_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse interval less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. The last " + "bucket contains the number of blocks with reuse interval longer than 100 " + "seconds."); +DEFINE_string( + reuse_lifetime_labels, "", + "Group the reuse lifetime of a block using these labels. Reuse " + "lifetime is defined as the time interval between the first access on a " + "block and the last access on the same block. For blocks that are only " + "accessed once, its lifetime is set to kMaxUint64."); +DEFINE_string( + reuse_lifetime_buckets, "", + "Group blocks by their reuse lifetime given these buckets. For " + "example, if 'reuse_lifetime_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse lifetime less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. The last " + "bucket contains the number of blocks with reuse lifetime longer than 100 " + "seconds."); +DEFINE_string( + analyze_callers, "", + "The list of callers to perform a detailed analysis on. 
If speicfied, the " + "analyzer will output a detailed percentage of accesses for each caller " + "break down by column family, level, and block type. A list of available " + "callers are: Get, MultiGet, Iterator, ApproximateSize, VerifyChecksum, " + "SSTDumpTool, ExternalSSTIngestion, Repair, Prefetch, Compaction, " + "CompactionRefill, Flush, SSTFileReader, Uncategorized."); +DEFINE_string(access_count_buckets, "", + "Group number of blocks by their access count given these " + "buckets. If specified, the analyzer will output a detailed " + "analysis on the number of blocks grouped by their access count " + "break down by block type and column family."); +DEFINE_int32(analyze_blocks_reuse_k_reuse_window, 0, + "Analyze the percentage of blocks that are accessed in the " + "[k, 2*k] seconds are accessed again in the next [2*k, 3*k], " + "[3*k, 4*k],...,[k*(n-1), k*n] seconds. "); +DEFINE_string(analyze_get_spatial_locality_labels, "", + "Group data blocks using these labels."); +DEFINE_string(analyze_get_spatial_locality_buckets, "", + "Group data blocks by their statistics using these buckets."); +DEFINE_string(skew_labels, "", + "Group the access count of a block using these labels."); +DEFINE_string(skew_buckets, "", "Group the skew labels using these buckets."); +DEFINE_bool(mrc_only, false, + "Evaluate alternative cache policies only. When this flag is true, " + "the analyzer does NOT maintain states of each block in memory for " + "analysis. It only feeds the accesses into the cache simulators."); +DEFINE_string( + analyze_correlation_coefficients_labels, "", + "Analyze the correlation coefficients of features such as number of past " + "accesses with regard to the number of accesses till the next access."); +DEFINE_int32(analyze_correlation_coefficients_max_number_of_values, 1000000, + "The maximum number of values for a feature. 
If the number of " + "values for a feature is larger than this max, it randomly " + "selects 'max' number of values."); +DEFINE_string(human_readable_trace_file_path, "", + "The filt path that saves human readable access records."); + +namespace ROCKSDB_NAMESPACE { +namespace { + +const std::string kMissRatioCurveFileName = "mrc"; +const std::string kGroupbyBlock = "block"; +const std::string kGroupbyTable = "table"; +const std::string kGroupbyColumnFamily = "cf"; +const std::string kGroupbySSTFile = "sst"; +const std::string kGroupbyBlockType = "bt"; +const std::string kGroupbyCaller = "caller"; +const std::string kGroupbyLevel = "level"; +const std::string kGroupbyAll = "all"; +const std::set<std::string> kGroupbyLabels{ + kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, + kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; +const std::string kSupportedCacheNames = + " lru ghost_lru lru_priority ghost_lru_priority lru_hybrid " + "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss " + "ghost_lru_hybrid_no_insert_on_row_miss "; + +// The suffix for the generated csv files. 
+const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline"; +const std::string kFileNameSuffixMissTimeline = "miss_timeline"; +const std::string kFileNameSuffixSkew = "skewness"; +const std::string kFileNameSuffixAccessTimeline = "access_timeline"; +const std::string kFileNameSuffixCorrelation = "correlation_input"; +const std::string kFileNameSuffixAvgReuseIntervalNaccesses = + "avg_reuse_interval_naccesses"; +const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval"; +const std::string kFileNameSuffixReuseInterval = "access_reuse_interval"; +const std::string kFileNameSuffixReuseLifetime = "reuse_lifetime"; +const std::string kFileNameSuffixAccessReuseBlocksTimeline = + "reuse_blocks_timeline"; +const std::string kFileNameSuffixPercentOfAccessSummary = + "percentage_of_accesses_summary"; +const std::string kFileNameSuffixPercentRefKeys = "percent_ref_keys"; +const std::string kFileNameSuffixPercentDataSizeOnRefKeys = + "percent_data_size_on_ref_keys"; +const std::string kFileNameSuffixPercentAccessesOnRefKeys = + "percent_accesses_on_ref_keys"; +const std::string kFileNameSuffixAccessCountSummary = "access_count_summary"; + +std::string block_type_to_string(TraceType type) { + switch (type) { + case kBlockTraceFilterBlock: + return "Filter"; + case kBlockTraceDataBlock: + return "Data"; + case kBlockTraceIndexBlock: + return "Index"; + case kBlockTraceRangeDeletionBlock: + return "RangeDeletion"; + case kBlockTraceUncompressionDictBlock: + return "UncompressionDict"; + default: + break; + } + // This cannot happen. 
+ return "InvalidType"; +} + +std::string caller_to_string(TableReaderCaller caller) { + switch (caller) { + case kUserGet: + return "Get"; + case kUserMultiGet: + return "MultiGet"; + case kUserIterator: + return "Iterator"; + case kUserApproximateSize: + return "ApproximateSize"; + case kUserVerifyChecksum: + return "VerifyChecksum"; + case kSSTDumpTool: + return "SSTDumpTool"; + case kExternalSSTIngestion: + return "ExternalSSTIngestion"; + case kRepair: + return "Repair"; + case kPrefetch: + return "Prefetch"; + case kCompaction: + return "Compaction"; + case kCompactionRefill: + return "CompactionRefill"; + case kFlush: + return "Flush"; + case kSSTFileReader: + return "SSTFileReader"; + case kUncategorized: + return "Uncategorized"; + default: + break; + } + // This cannot happen. + return "InvalidCaller"; +} + +TableReaderCaller string_to_caller(std::string caller_str) { + if (caller_str == "Get") { + return kUserGet; + } else if (caller_str == "MultiGet") { + return kUserMultiGet; + } else if (caller_str == "Iterator") { + return kUserIterator; + } else if (caller_str == "ApproximateSize") { + return kUserApproximateSize; + } else if (caller_str == "VerifyChecksum") { + return kUserVerifyChecksum; + } else if (caller_str == "SSTDumpTool") { + return kSSTDumpTool; + } else if (caller_str == "ExternalSSTIngestion") { + return kExternalSSTIngestion; + } else if (caller_str == "Repair") { + return kRepair; + } else if (caller_str == "Prefetch") { + return kPrefetch; + } else if (caller_str == "Compaction") { + return kCompaction; + } else if (caller_str == "CompactionRefill") { + return kCompactionRefill; + } else if (caller_str == "Flush") { + return kFlush; + } else if (caller_str == "SSTFileReader") { + return kSSTFileReader; + } else if (caller_str == "Uncategorized") { + return kUncategorized; + } + return TableReaderCaller::kMaxBlockCacheLookupCaller; +} + +bool is_user_access(TableReaderCaller caller) { + switch (caller) { + case kUserGet: + case 
kUserMultiGet: + case kUserIterator: + case kUserApproximateSize: + case kUserVerifyChecksum: + return true; + default: + break; + } + return false; +} + +const char kBreakLine[] = + "***************************************************************\n"; + +void print_break_lines(uint32_t num_break_lines) { + for (uint32_t i = 0; i < num_break_lines; i++) { + fprintf(stdout, kBreakLine); + } +} + +double percent(uint64_t numerator, uint64_t denomenator) { + if (denomenator == 0) { + return -1; + } + return static_cast<double>(numerator * 100.0 / denomenator); +} + +std::map<uint64_t, uint64_t> adjust_time_unit( + const std::map<uint64_t, uint64_t>& time_stats, uint64_t time_unit) { + if (time_unit == 1) { + return time_stats; + } + std::map<uint64_t, uint64_t> adjusted_time_stats; + for (auto const& time : time_stats) { + adjusted_time_stats[static_cast<uint64_t>(time.first / time_unit)] += + time.second; + } + return adjusted_time_stats; +} +} // namespace + +void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { + if (!cache_simulator_) { + return; + } + if (output_dir_.empty()) { + return; + } + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + uint64_t total_accesses = access_sequence_number_; + const std::string output_miss_ratio_curve_path = + output_dir_ + "/" + std::to_string(trace_duration) + "_" + + std::to_string(total_accesses) + "_" + kMissRatioCurveFileName; + std::ofstream out(output_miss_ratio_curve_path); + if (!out.is_open()) { + return; + } + // Write header. + const std::string header = + "cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio,total_" + "accesses"; + out << header << std::endl; + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + double miss_ratio = + config_caches.second[i]->miss_ratio_stats().miss_ratio(); + // Write the body. 
+ out << config.cache_name; + out << ","; + out << config.num_shard_bits; + out << ","; + out << config.ghost_cache_capacity; + out << ","; + out << config.cache_capacities[i]; + out << ","; + out << std::fixed << std::setprecision(4) << miss_ratio; + out << ","; + out << config_caches.second[i]->miss_ratio_stats().total_accesses(); + out << std::endl; + } + } + out.close(); +} + +void BlockCacheTraceAnalyzer::UpdateFeatureVectors( + const std::vector<uint64_t>& access_sequence_number_timeline, + const std::vector<uint64_t>& access_timeline, const std::string& label, + std::map<std::string, Features>* label_features, + std::map<std::string, Predictions>* label_predictions) const { + if (access_sequence_number_timeline.empty() || access_timeline.empty()) { + return; + } + assert(access_timeline.size() == access_sequence_number_timeline.size()); + uint64_t prev_access_sequence_number = access_sequence_number_timeline[0]; + uint64_t prev_access_timestamp = access_timeline[0]; + for (uint32_t i = 0; i < access_sequence_number_timeline.size(); i++) { + uint64_t num_accesses_since_last_access = + access_sequence_number_timeline[i] - prev_access_sequence_number; + uint64_t elapsed_time_since_last_access = + access_timeline[i] - prev_access_timestamp; + prev_access_sequence_number = access_sequence_number_timeline[i]; + prev_access_timestamp = access_timeline[i]; + if (i < access_sequence_number_timeline.size() - 1) { + (*label_features)[label].num_accesses_since_last_access.push_back( + num_accesses_since_last_access); + (*label_features)[label].num_past_accesses.push_back(i); + (*label_features)[label].elapsed_time_since_last_access.push_back( + elapsed_time_since_last_access); + } + if (i >= 1) { + (*label_predictions)[label].num_accesses_till_next_access.push_back( + num_accesses_since_last_access); + (*label_predictions)[label].elapsed_time_till_next_access.push_back( + elapsed_time_since_last_access); + } + } +} + +void 
BlockCacheTraceAnalyzer::WriteMissRatioTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map<uint64_t, std::map<std::string, std::map<uint64_t, double>>> + cs_name_timeline; + uint64_t start_time = std::numeric_limits<uint64_t>::max(); + uint64_t end_time = 0; + const std::map<uint64_t, uint64_t>& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + const std::map<uint64_t, uint64_t>& trace_num_accesses = + adjust_time_unit(miss_ratio_stats_.num_accesses_timeline(), time_unit); + assert(trace_num_misses.size() == trace_num_accesses.size()); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = trace_num_accesses.find(time); + assert(it != trace_num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[std::numeric_limits<uint64_t>::max()]["trace"][time] = + percent(miss, access); + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map<uint64_t, uint64_t>& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + const std::map<uint64_t, uint64_t>& num_accesses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_accesses_timeline(), + time_unit); + assert(num_misses.size() == num_accesses.size()); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = num_accesses.find(time); 
+ assert(it != num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = + percent(miss, access); + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissRatioTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map<uint64_t, std::map<std::string, std::map<uint64_t, uint64_t>>> + cs_name_timeline; + uint64_t start_time = std::numeric_limits<uint64_t>::max(); + uint64_t end_time = 0; + const std::map<uint64_t, uint64_t>& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[std::numeric_limits<uint64_t>::max()]["trace"][time] = + miss; + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + 
std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map<uint64_t, uint64_t>& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = miss; + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteSkewness( + const std::string& label_str, const std::vector<uint64_t>& percent_buckets, + TraceType target_block_type) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, uint64_t> label_naccesses; + uint64_t total_naccesses = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + if (target_block_type != TraceType::kTraceMax && + target_block_type != type) { + return; + } + const std::string label 
= BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block); + label_naccesses[label] += block.num_accesses; + total_naccesses += block.num_accesses; + }; + TraverseBlocks(block_callback, &labels); + std::map<std::string, std::map<uint64_t, uint64_t>> label_bucket_naccesses; + std::vector<std::pair<std::string, uint64_t>> pairs; + for (auto const& itr : label_naccesses) { + pairs.push_back(itr); + } + // Sort in descending order. + sort(pairs.begin(), pairs.end(), + [](const std::pair<std::string, uint64_t>& a, + const std::pair<std::string, uint64_t>& b) { + return b.second < a.second; + }); + + size_t prev_start_index = 0; + for (auto const& percent : percent_buckets) { + label_bucket_naccesses[label_str][percent] = 0; + size_t end_index = 0; + if (percent == std::numeric_limits<uint64_t>::max()) { + end_index = label_naccesses.size(); + } else { + end_index = percent * label_naccesses.size() / 100; + } + for (size_t i = prev_start_index; i < end_index; i++) { + label_bucket_naccesses[label_str][percent] += pairs[i].second; + } + prev_start_index = end_index; + } + std::string filename_suffix; + if (target_block_type != TraceType::kTraceMax) { + filename_suffix = block_type_to_string(target_block_type); + filename_suffix += "_"; + } + filename_suffix += kFileNameSuffixSkew; + WriteStatsToFile(label_str, percent_buckets, filename_suffix, + label_bucket_naccesses, total_naccesses); +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeatures( + const std::string& label_str, uint32_t max_number_of_values) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, Features> label_features; + std::map<std::string, Predictions> label_predictions; + auto block_callback = + [&](const std::string& cf_name, uint64_t fd, uint32_t level, + TraceType block_type, const std::string& /*block_key*/, + uint64_t /*block_key_id*/, const BlockAccessInfo& block) { + if (block.table_id == 0 && 
labels.find(kGroupbyTable) != labels.end()) { + // We only know table id information for get requests. + return; + } + if (labels.find(kGroupbyCaller) != labels.end()) { + // Group by caller. + for (auto const& caller_map : block.caller_access_timeline) { + const std::string label = + BuildLabel(labels, cf_name, fd, level, block_type, + caller_map.first, /*block_id=*/0, block); + auto it = block.caller_access_sequence__number_timeline.find( + caller_map.first); + assert(it != block.caller_access_sequence__number_timeline.end()); + UpdateFeatureVectors(it->second, caller_map.second, label, + &label_features, &label_predictions); + } + return; + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, block_type, + TableReaderCaller::kMaxBlockCacheLookupCaller, + /*block_id=*/0, block); + UpdateFeatureVectors(block.access_sequence_number_timeline, + block.access_timeline, label, &label_features, + &label_predictions); + }; + TraverseBlocks(block_callback, &labels); + WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions, + max_number_of_values); +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map<std::string, Features>& label_features, + const std::map<std::string, Predictions>& label_predictions, + uint32_t max_number_of_values) const { + for (auto const& label_feature_vectors : label_features) { + const Features& past = label_feature_vectors.second; + auto it = label_predictions.find(label_feature_vectors.first); + assert(it != label_predictions.end()); + const Predictions& future = it->second; + const std::string output_path = output_dir_ + "/" + label + "_" + + label_feature_vectors.first + "_" + + kFileNameSuffixCorrelation; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header( + "num_accesses_since_last_access,elapsed_time_since_last_access,num_" + "past_accesses,num_accesses_till_next_access,elapsed_time_till_next_" + 
"access"); + out << header << std::endl; + std::vector<uint32_t> indexes; + for (uint32_t i = 0; i < past.num_accesses_since_last_access.size(); i++) { + indexes.push_back(i); + } + RandomShuffle(indexes.begin(), indexes.end()); + for (uint32_t i = 0; i < max_number_of_values && i < indexes.size(); i++) { + uint32_t rand_index = indexes[i]; + out << std::to_string(past.num_accesses_since_last_access[rand_index]) + << ","; + out << std::to_string(past.elapsed_time_since_last_access[rand_index]) + << ","; + out << std::to_string(past.num_past_accesses[rand_index]) << ","; + out << std::to_string(future.num_accesses_till_next_access[rand_index]) + << ","; + out << std::to_string(future.elapsed_time_till_next_access[rand_index]) + << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesForGet( + uint32_t max_number_of_values) const { + std::string label = "GetKeyInfo"; + std::map<std::string, Features> label_features; + std::map<std::string, Predictions> label_predictions; + for (auto const& get_info : get_key_info_map_) { + const GetKeyInfo& info = get_info.second; + UpdateFeatureVectors(info.access_sequence_number_timeline, + info.access_timeline, label, &label_features, + &label_predictions); + } + WriteCorrelationFeaturesToFile(label, label_features, label_predictions, + max_number_of_values); +} + +std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr( + const std::string& label_str) const { + std::stringstream ss(label_str); + std::set<std::string> labels; + // label_str is in the form of "label1_label2_label3", e.g., cf_bt. + while (ss.good()) { + std::string label_name; + getline(ss, label_name, '_'); + if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) { + // Unknown label name. 
+ fprintf(stderr, "Unknown label name %s, label string %s\n", + label_name.c_str(), label_str.c_str()); + return {}; + } + labels.insert(label_name); + } + return labels; +} + +std::string BlockCacheTraceAnalyzer::BuildLabel( + const std::set<std::string>& labels, const std::string& cf_name, + uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, + uint64_t block_key, const BlockAccessInfo& block) const { + std::map<std::string, std::string> label_value_map; + label_value_map[kGroupbyAll] = kGroupbyAll; + label_value_map[kGroupbyLevel] = std::to_string(level); + label_value_map[kGroupbyCaller] = caller_to_string(caller); + label_value_map[kGroupbySSTFile] = std::to_string(fd); + label_value_map[kGroupbyBlockType] = block_type_to_string(type); + label_value_map[kGroupbyColumnFamily] = cf_name; + label_value_map[kGroupbyBlock] = std::to_string(block_key); + label_value_map[kGroupbyTable] = std::to_string(block.table_id); + // Concatenate the label values. + std::string label; + for (auto const& l : labels) { + label += label_value_map[l]; + label += "-"; + } + if (!label.empty()) { + label.pop_back(); + } + return label; +} + +void BlockCacheTraceAnalyzer::TraverseBlocks( + std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/, + uint32_t /*level*/, TraceType /*block_type*/, + const std::string& /*block_key*/, + uint64_t /*block_key_id*/, + const BlockAccessInfo& /*block_access_info*/)> + block_callback, + std::set<std::string>* labels) const { + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. 
+ const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + if (labels && block_access_info.second.table_id == 0 && + labels->find(kGroupbyTable) != labels->end()) { + // We only know table id information for get requests. + return; + } + block_callback(cf_name, fd, level, type, block_access_info.first, + block_access_info.second.block_id, + block_access_info.second); + } + } + } + } +} + +void BlockCacheTraceAnalyzer::WriteGetSpatialLocality( + const std::string& label_str, + const std::vector<uint64_t>& percent_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefkeys_nblocks; + std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefs_nblocks; + std::map<std::string, std::map<uint64_t, uint64_t>> label_pndatasize_nblocks; + uint64_t nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType /*block_type*/, + const std::string& /*block_key*/, + uint64_t /*block_key_id*/, + const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + uint64_t naccesses = 0; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + if (caller_access.first == TableReaderCaller::kUserGet) { + naccesses += caller_access.second; + } + } + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock, + TableReaderCaller::kUserGet, /*block_id=*/0, block); + + const uint64_t percent_referenced_for_existing_keys = + static_cast<uint64_t>(std::max( + percent(block.key_num_access_map.size(), block.num_keys), 0.0)); + const uint64_t percent_accesses_for_existing_keys = + static_cast<uint64_t>(std::max( + percent(block.num_referenced_key_exist_in_block, naccesses), 0.0)); + const uint64_t percent_referenced_data_size = 
static_cast<uint64_t>( + std::max(percent(block.referenced_data_size, block.block_size), 0.0)); + if (label_pnrefkeys_nblocks.find(label) == label_pnrefkeys_nblocks.end()) { + for (auto const& percent_bucket : percent_buckets) { + label_pnrefkeys_nblocks[label][percent_bucket] = 0; + label_pnrefs_nblocks[label][percent_bucket] = 0; + label_pndatasize_nblocks[label][percent_bucket] = 0; + } + } + label_pnrefkeys_nblocks[label] + .upper_bound(percent_referenced_for_existing_keys) + ->second += 1; + label_pnrefs_nblocks[label] + .upper_bound(percent_accesses_for_existing_keys) + ->second += 1; + label_pndatasize_nblocks[label] + .upper_bound(percent_referenced_data_size) + ->second += 1; + nblocks += 1; + }; + TraverseBlocks(block_callback, &labels); + WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys, + label_pnrefkeys_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentAccessesOnRefKeys, + label_pnrefs_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentDataSizeOnRefKeys, + label_pndatasize_nblocks, nblocks); +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str, + uint64_t time_unit, + bool user_access_only) const { + std::set<std::string> labels = ParseLabelStr(label_str); + uint64_t start_time = std::numeric_limits<uint64_t>::max(); + uint64_t end_time = 0; + std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline; + std::map<uint64_t, std::vector<std::string>> access_count_block_id_map; + + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + if (user_access_only && !is_user_access(caller)) { + continue; + } + const std::string label = 
+ BuildLabel(labels, cf_name, fd, level, type, caller, block_id, block); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first / time_unit; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + naccesses += num; + } + } + if (naccesses > 0) { + access_count_block_id_map[naccesses].push_back(std::to_string(block_id)); + } + }; + TraverseBlocks(block_callback, &labels); + + // We have label_access_timeline now. Write them into a file. + const std::string user_access_prefix = + user_access_only ? "user_access_only_" : "all_access_"; + const std::string output_path = output_dir_ + "/" + user_access_prefix + + label_str + "_" + std::to_string(time_unit) + + "_" + kFileNameSuffixAccessTimeline; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + if (labels.find("block") != labels.end()) { + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + // Write the most frequently accessed blocks first. 
+ for (auto naccess_it = access_count_block_id_map.rbegin(); + naccess_it != access_count_block_id_map.rend(); naccess_it++) { + for (auto& block_id_it : naccess_it->second) { + std::string row(block_id_it); + for (uint64_t now = start_time; now <= end_time; now++) { + auto it = label_access_timeline[block_id_it].find(now); + row += ","; + if (it != label_access_timeline[block_id_it].end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + } + out.close(); + return; + } + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : label_access_timeline) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto it = label.second.find(now); + row += ","; + if (it != label.second.end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteReuseDistance( + const std::string& label_str, + const std::vector<uint64_t>& distance_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses; + uint64_t total_num_reuses = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. 
+ for (auto const& distance_bucket : distance_buckets) { + label_distance_num_reuses[label][distance_bucket] = 0; + } + } + for (auto const& reuse_distance : block.reuse_distance_count) { + label_distance_num_reuses[label] + .upper_bound(reuse_distance.first) + ->second += reuse_distance.second; + total_num_reuses += reuse_distance.second; + } + }; + TraverseBlocks(block_callback, &labels); + // We have label_naccesses and label_distance_num_reuses now. Write them into + // a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_reuse_distance"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_distance_num_reuses) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + for (auto const& bucket : distance_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_distance_num_reuses) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(percent(it->second, total_num_reuses)); + } + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats( + const std::string& label, const std::vector<uint64_t>& time_buckets, + const std::map<uint64_t, uint64_t> timeline, + std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses, + uint64_t* total_num_reuses) const { + assert(label_time_num_reuses); + assert(total_num_reuses); + if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) { + // The first time we encounter this label. + for (auto const& time_bucket : time_buckets) { + (*label_time_num_reuses)[label][time_bucket] = 0; + } + } + auto it = timeline.begin(); + uint64_t prev_timestamp = it->first; + const uint64_t prev_num = it->second; + it++; + // Reused within one second. 
+ if (prev_num > 1) { + (*label_time_num_reuses)[label].upper_bound(0)->second += prev_num - 1; + *total_num_reuses += prev_num - 1; + } + while (it != timeline.end()) { + const uint64_t timestamp = it->first; + const uint64_t num = it->second; + const uint64_t reuse_interval = timestamp - prev_timestamp; + (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += 1; + if (num > 1) { + (*label_time_num_reuses)[label].upper_bound(0)->second += num - 1; + } + prev_timestamp = timestamp; + *total_num_reuses += num; + it++; + } +} + +void BlockCacheTraceAnalyzer::WriteStatsToFile( + const std::string& label_str, const std::vector<uint64_t>& time_buckets, + const std::string& filename_suffix, + const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data, + uint64_t ntotal) const { + const std::string output_path = + output_dir_ + "/" + label_str + "_" + filename_suffix; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_data) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + for (auto const& bucket : time_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_data) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(percent(it->second, ntotal)); + } + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteReuseInterval( + const std::string& label_str, + const std::vector<uint64_t>& time_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses; + std::map<std::string, std::map<uint64_t, uint64_t>> label_avg_reuse_nblocks; + std::map<std::string, std::map<uint64_t, uint64_t>> label_avg_reuse_naccesses; + + uint64_t total_num_reuses = 0; + uint64_t total_nblocks = 0; + uint64_t total_accesses = 0; + 
auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + total_nblocks++; + total_accesses += block.num_accesses; + uint64_t avg_reuse_interval = 0; + if (block.num_accesses > 1) { + avg_reuse_interval = ((block.last_access_time - block.first_access_time) / + kMicrosInSecond) / + block.num_accesses; + } else { + avg_reuse_interval = std::numeric_limits<uint64_t>::max() - 1; + } + if (labels.find(kGroupbyCaller) != labels.end()) { + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string label = BuildLabel(labels, cf_name, fd, level, type, + caller, block_id, block); + UpdateReuseIntervalStats(label, time_buckets, timeline.second, + &label_time_num_reuses, &total_num_reuses); + } + return; + } + // Does not group by caller so we need to flatten the access timeline. + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block); + std::map<uint64_t, uint64_t> timeline; + for (auto const& caller_timeline : block.caller_num_accesses_timeline) { + for (auto const& time_naccess : caller_timeline.second) { + timeline[time_naccess.first] += time_naccess.second; + } + } + UpdateReuseIntervalStats(label, time_buckets, timeline, + &label_time_num_reuses, &total_num_reuses); + if (label_avg_reuse_nblocks.find(label) == label_avg_reuse_nblocks.end()) { + for (auto const& time_bucket : time_buckets) { + label_avg_reuse_nblocks[label][time_bucket] = 0; + label_avg_reuse_naccesses[label][time_bucket] = 0; + } + } + label_avg_reuse_nblocks[label].upper_bound(avg_reuse_interval)->second += 1; + label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second += + block.num_accesses; + }; + TraverseBlocks(block_callback, &labels); + + // Write the stats into files. 
+ WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval, + label_time_num_reuses, total_num_reuses); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixAvgReuseInterval, + label_avg_reuse_nblocks, total_nblocks); + WriteStatsToFile(label_str, time_buckets, + kFileNameSuffixAvgReuseIntervalNaccesses, + label_avg_reuse_naccesses, total_accesses); +} + +void BlockCacheTraceAnalyzer::WriteReuseLifetime( + const std::string& label_str, + const std::vector<uint64_t>& time_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_lifetime_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t lifetime = 0; + if (block.num_accesses > 1) { + lifetime = + (block.last_access_time - block.first_access_time) / kMicrosInSecond; + } else { + lifetime = std::numeric_limits<uint64_t>::max() - 1; + } + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block); + + if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) { + // The first time we encounter this label. + for (auto const& time_bucket : time_buckets) { + label_lifetime_nblocks[label][time_bucket] = 0; + } + } + label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1; + total_nblocks += 1; + }; + TraverseBlocks(block_callback, &labels); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime, + label_lifetime_nblocks, total_nblocks); +} + +void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( + const uint64_t reuse_window, bool user_access_only, + TraceType block_type) const { + // A map from block key to an array of bools that states whether a block is + // accessed in a time window. 
+ std::map<uint64_t, std::vector<bool>> block_accessed; + const uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + const uint64_t reuse_vector_size = (trace_duration / reuse_window); + if (reuse_vector_size < 2) { + // The reuse window is less than 2. We cannot calculate the reused + // percentage of blocks. + return; + } + auto block_callback = [&](const std::string& /*cf_name*/, uint64_t /*fd*/, + uint32_t /*level*/, TraceType /*type*/, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + if (block_accessed.find(block_id) == block_accessed.end()) { + block_accessed[block_id].resize(reuse_vector_size); + for (uint64_t i = 0; i < reuse_vector_size; i++) { + block_accessed[block_id][i] = false; + } + } + for (auto const& caller_num : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = caller_num.first; + for (auto const& timeline : caller_num.second) { + const uint64_t timestamp = timeline.first; + const uint64_t elapsed_time = + timestamp - trace_start_timestamp_in_seconds_; + if (!user_access_only || is_user_access(caller)) { + uint64_t index = + std::min(elapsed_time / reuse_window, reuse_vector_size - 1); + block_accessed[block_id][index] = true; + } + } + } + }; + TraverseBlocks(block_callback); + + // A cell is the number of blocks accessed in a reuse window. + std::unique_ptr<uint64_t[]> reuse_table( + new uint64_t[reuse_vector_size * reuse_vector_size]); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + // Initialize the reuse_table. + for (uint64_t i = 0; i < reuse_vector_size; i++) { + reuse_table[start_time * reuse_vector_size + i] = 0; + } + // Examine all blocks. + for (auto const& block : block_accessed) { + for (uint64_t i = start_time; i < reuse_vector_size; i++) { + if (block.second[start_time] && block.second[i]) { + // This block is accessed at start time and at the current time. 
We + // increment reuse_table[start_time][i] since it is reused at the ith + // window. + reuse_table[start_time * reuse_vector_size + i]++; + } + } + } + } + const std::string user_access_prefix = + user_access_only ? "_user_access_only_" : "_all_access_"; + const std::string output_path = + output_dir_ + "/" + block_type_to_string(block_type) + + user_access_prefix + std::to_string(reuse_window) + "_" + + kFileNameSuffixAccessReuseBlocksTimeline; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("start_time"); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + header += ","; + header += std::to_string(start_time); + } + out << header << std::endl; + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + std::string row(std::to_string(start_time * reuse_window)); + for (uint64_t j = 0; j < reuse_vector_size; j++) { + row += ","; + if (j < start_time) { + row += "100.0"; + } else { + row += std::to_string( + percent(reuse_table[start_time * reuse_vector_size + j], + reuse_table[start_time * reuse_vector_size + start_time])); + } + } + out << row << std::endl; + } + out.close(); +} + +std::string BlockCacheTraceAnalyzer::OutputPercentAccessStats( + uint64_t total_accesses, + const std::map<std::string, uint64_t>& cf_access_count) const { + std::string row; + for (auto const& cf_aggregates : cf_aggregates_map_) { + const std::string& cf_name = cf_aggregates.first; + const auto& naccess = cf_access_count.find(cf_name); + row += ","; + if (naccess != cf_access_count.end()) { + row += std::to_string(percent(naccess->second, total_accesses)); + } else { + row += "0"; + } + } + return row; +} + +void BlockCacheTraceAnalyzer::WritePercentAccessSummaryStats() const { + std::map<TableReaderCaller, std::map<std::string, uint64_t>> + caller_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + 
TraceType /*type*/, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + const uint64_t naccess = caller_num.second; + caller_cf_accesses[caller][cf_name] += naccess; + total_accesses += naccess; + } + }; + TraverseBlocks(block_callback); + + const std::string output_path = + output_dir_ + "/" + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("caller"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& cf_naccess_it : caller_cf_accesses) { + const TableReaderCaller caller = cf_naccess_it.first; + std::string row; + row += caller_to_string(caller); + row += OutputPercentAccessStats(total_accesses, cf_naccess_it.second); + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteDetailedPercentAccessSummaryStats( + TableReaderCaller analyzing_caller) const { + std::map<uint32_t, std::map<std::string, uint64_t>> level_cf_accesses; + std::map<TraceType, std::map<std::string, uint64_t>> bt_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t level, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + if (caller == analyzing_caller) { + const uint64_t naccess = caller_num.second; + level_cf_accesses[level][cf_name] += naccess; + bt_cf_accesses[type][cf_name] += naccess; + total_accesses += naccess; + } + } + }; + TraverseBlocks(block_callback); + { + const std::string output_path = + output_dir_ + "/" + caller_to_string(analyzing_caller) + "_level_" + + 
kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("level"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& level_naccess_it : level_cf_accesses) { + const uint32_t level = level_naccess_it.first; + std::string row; + row += std::to_string(level); + row += OutputPercentAccessStats(total_accesses, level_naccess_it.second); + out << row << std::endl; + } + out.close(); + } + { + const std::string output_path = + output_dir_ + "/" + caller_to_string(analyzing_caller) + "_bt_" + + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bt"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& bt_naccess_it : bt_cf_accesses) { + const TraceType bt = bt_naccess_it.first; + std::string row; + row += block_type_to_string(bt); + row += OutputPercentAccessStats(total_accesses, bt_naccess_it.second); + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats( + const std::vector<uint64_t>& access_count_buckets, + bool user_access_only) const { + // x: buckets. + // y: # of accesses. + std::map<std::string, std::map<uint64_t, uint64_t>> bt_access_nblocks; + std::map<std::string, std::map<uint64_t, uint64_t>> cf_access_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + const std::string type_str = block_type_to_string(type); + if (cf_access_nblocks.find(cf_name) == cf_access_nblocks.end()) { + // initialize. 
          // Pre-populate every bucket with 0 so each CSV row has a value for
          // every column.
          for (auto& access : access_count_buckets) {
            cf_access_nblocks[cf_name][access] = 0;
          }
        }
        if (bt_access_nblocks.find(type_str) == bt_access_nblocks.end()) {
          // initialize.
          for (auto& access : access_count_buckets) {
            bt_access_nblocks[type_str][access] = 0;
          }
        }
        // Total accesses on this block, optionally restricted to user-issued
        // callers.
        uint64_t naccesses = 0;
        for (auto const& caller_access : block.caller_num_access_map) {
          if (!user_access_only || is_user_access(caller_access.first)) {
            naccesses += caller_access.second;
          }
        }
        if (naccesses == 0) {
          // Skip blocks with no (qualifying) accesses.
          return;
        }
        total_nblocks += 1;
        // upper_bound selects the smallest bucket strictly greater than
        // naccesses. NOTE(review): this assumes the bucket list ends with a
        // catch-all max value (parse_buckets appends uint64 max), otherwise
        // upper_bound could return end().
        bt_access_nblocks[type_str].upper_bound(naccesses)->second += 1;
        cf_access_nblocks[cf_name].upper_bound(naccesses)->second += 1;
      };
  TraverseBlocks(block_callback);
  const std::string user_access_prefix =
      user_access_only ? "user_access_only_" : "all_access_";
  // Emit one summary broken down by column family and one by block type.
  WriteStatsToFile("cf", access_count_buckets,
                   user_access_prefix + kFileNameSuffixAccessCountSummary,
                   cf_access_nblocks, total_nblocks);
  WriteStatsToFile("bt", access_count_buckets,
                   user_access_prefix + kFileNameSuffixAccessCountSummary,
                   bt_access_nblocks, total_nblocks);
}

// Constructor: stores the configuration and takes ownership of the optional
// cache simulator. No file is opened until Analyze() runs.
BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(
    const std::string& trace_file_path, const std::string& output_dir,
    const std::string& human_readable_trace_file_path,
    bool compute_reuse_distance, bool mrc_only,
    bool is_human_readable_trace_file,
    std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)
    : env_(ROCKSDB_NAMESPACE::Env::Default()),
      trace_file_path_(trace_file_path),
      output_dir_(output_dir),
      human_readable_trace_file_path_(human_readable_trace_file_path),
      compute_reuse_distance_(compute_reuse_distance),
      mrc_only_(mrc_only),
      is_human_readable_trace_file_(is_human_readable_trace_file),
      cache_simulator_(std::move(cache_simulator)) {}

// Records the reuse distance of a re-accessed block: the total size of the
// distinct blocks accessed since this block's previous access.
void BlockCacheTraceAnalyzer::ComputeReuseDistance(
    BlockAccessInfo* info) const {
  assert(info);
  if (info->num_accesses == 0) {
    // First access to this block: no previous access to measure from.
    return;
  }
  uint64_t reuse_distance = 0;
  for (auto const& block_key :
       info->unique_blocks_since_last_access) {
    auto const& it = block_info_map_.find(block_key);
    // This block must exist.
    assert(it != block_info_map_.end());
    reuse_distance += it->second->block_size;
  }
  info->reuse_distance_count[reuse_distance] += 1;
  // We clear this hash set since this is the second access on this block.
  info->unique_blocks_since_last_access.clear();
}

// Folds one trace record into the per-cf / per-file / per-block-type
// aggregates, assigns stable ids to first-seen blocks and Get keys, and
// appends the record to the optional human-readable trace output.
Status BlockCacheTraceAnalyzer::RecordAccess(
    const BlockCacheTraceRecord& access) {
  ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name];
  SSTFileAccessInfoAggregate& file_aggr =
      cf_aggr.fd_aggregates_map[access.sst_fd_number];
  file_aggr.level = access.level;
  BlockTypeAccessInfoAggregate& block_type_aggr =
      file_aggr.block_type_aggregates_map[access.block_type];
  if (block_type_aggr.block_access_info_map.find(access.block_key) ==
      block_type_aggr.block_access_info_map.end()) {
    // First time this block is seen: assign the next unique id.
    block_type_aggr.block_access_info_map[access.block_key].block_id =
        unique_block_id_;
    unique_block_id_++;
  }
  BlockAccessInfo& block_access_info =
      block_type_aggr.block_access_info_map[access.block_key];
  if (compute_reuse_distance_) {
    // Runs before AddAccess; ComputeReuseDistance skips blocks whose
    // num_accesses is still 0 (i.e. this is the first access).
    ComputeReuseDistance(&block_access_info);
  }
  block_access_info.AddAccess(access, access_sequence_number_);
  block_info_map_[access.block_key] = &block_access_info;
  uint64_t get_key_id = 0;
  if (access.caller == TableReaderCaller::kUserGet &&
      access.get_id != BlockCacheTraceHelper::kReservedGetId) {
    // Track per-user-key statistics for genuine Get requests only.
    std::string user_key = ExtractUserKey(access.referenced_key).ToString();
    if (get_key_info_map_.find(user_key) == get_key_info_map_.end()) {
      get_key_info_map_[user_key].key_id = unique_get_key_id_;
      unique_get_key_id_++;
    }
    get_key_id = get_key_info_map_[user_key].key_id;
    get_key_info_map_[user_key].AddAccess(access, access_sequence_number_);
  }

  if (compute_reuse_distance_) {
    // Add this block to all existing blocks.
+ for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } + } + } + } + } + return human_readable_trace_writer_.WriteHumanReadableTraceRecord( + access, block_access_info.block_id, get_key_id); +} + +Status BlockCacheTraceAnalyzer::Analyze() { + SystemClock* clock = env_->GetSystemClock().get(); + std::unique_ptr<BlockCacheTraceReader> reader; + Status s = Status::OK(); + if (is_human_readable_trace_file_) { + reader.reset(new BlockCacheHumanReadableTraceReader(trace_file_path_)); + } else { + std::unique_ptr<TraceReader> trace_reader; + s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + if (!s.ok()) { + return s; + } + reader.reset(new BlockCacheTraceReader(std::move(trace_reader))); + s = reader->ReadHeader(&header_); + if (!s.ok()) { + return s; + } + } + if (!human_readable_trace_file_path_.empty()) { + s = human_readable_trace_writer_.NewWritableFile( + human_readable_trace_file_path_, env_); + if (!s.ok()) { + return s; + } + } + uint64_t start = clock->NowMicros(); + uint64_t time_interval = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader->ReadAccess(&access); + if (!s.ok()) { + break; + } + if (!mrc_only_) { + s = RecordAccess(access); + if (!s.ok()) { + break; + } + } + if (trace_start_timestamp_in_seconds_ == 0) { + trace_start_timestamp_in_seconds_ = + access.access_timestamp / kMicrosInSecond; + } + trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + is_user_access(access.caller), + !access.is_cache_hit); + if (cache_simulator_) { + cache_simulator_->Access(access); + } + 
access_sequence_number_++; + uint64_t now = clock->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + if (duration > 10 * time_interval) { + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); + time_interval++; + } + } + uint64_t now = clock->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); + return s; +} + +void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { + HistogramStat bs_stats; + std::map<TraceType, HistogramStat> bt_stats_map; + std::map<std::string, std::map<TraceType, HistogramStat>> cf_bt_stats_map; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + if (block.block_size == 0) { + // Block size may be 0 when 1) compaction observes a cache miss and + // does not insert the missing block into the cache again. 2) + // fetching filter blocks in SST files at the last level. 
+ return; + } + bs_stats.Add(block.block_size); + bt_stats_map[type].Add(block.block_size); + cf_bt_stats_map[cf_name][type].Add(block.block_size); + }; + TraverseBlocks(block_callback); + fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); + for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, "Block size stats for block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Block size stats for column family %s and block type %s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, + uint32_t bottom_k, + uint32_t top_k) const { + HistogramStat access_stats; + std::map<TraceType, HistogramStat> bt_stats_map; + std::map<std::string, std::map<TraceType, HistogramStat>> cf_bt_stats_map; + std::map<uint64_t, std::vector<std::string>> access_count_blocks; + auto block_callback = [&](const std::string& cf_name, uint64_t /*fd*/, + uint32_t /*level*/, TraceType type, + const std::string& block_key, uint64_t /*block_id*/, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& caller_access : block.caller_num_access_map) { + if (!user_access_only || is_user_access(caller_access.first)) { + naccesses += caller_access.second; + } + } + if (naccesses == 0) { + return; + } + if (type == TraceType::kBlockTraceDataBlock) { + access_count_blocks[naccesses].push_back(block_key); + } + access_stats.Add(naccesses); + bt_stats_map[type].Add(naccesses); + cf_bt_stats_map[cf_name][type].Add(naccesses); + }; + TraverseBlocks(block_callback); + fprintf(stdout, + "Block access count 
stats: The number of accesses per block. %s\n%s", + user_access_only ? "User accesses only" : "All accesses", + access_stats.ToString().c_str()); + uint32_t bottom_k_index = 0; + for (auto naccess_it = access_count_blocks.begin(); + naccess_it != access_count_blocks.end(); naccess_it++) { + bottom_k_index++; + if (bottom_k_index >= bottom_k) { + break; + } + std::map<TableReaderCaller, uint64_t> caller_naccesses; + uint64_t naccesses = 0; + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || is_user_access(caller_access.first)) { + caller_naccesses[caller_access.first] += caller_access.second; + naccesses += caller_access.second; + } + } + } + std::string statistics("Caller:"); + for (auto const& caller_naccessess_it : caller_naccesses) { + statistics += caller_to_string(caller_naccessess_it.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_naccessess_it.second, naccesses)); + statistics += ","; + } + fprintf(stdout, + "Bottom %" PRIu32 " access count. 
Access count=%" PRIu64 + " nblocks=%" ROCKSDB_PRIszt " %s\n", + bottom_k, naccess_it->first, naccess_it->second.size(), + statistics.c_str()); + } + + uint32_t top_k_index = 0; + for (auto naccess_it = access_count_blocks.rbegin(); + naccess_it != access_count_blocks.rend(); naccess_it++) { + top_k_index++; + if (top_k_index >= top_k) { + break; + } + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + std::string statistics("Caller:"); + uint64_t naccesses = 0; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || is_user_access(caller_access.first)) { + naccesses += caller_access.second; + } + } + assert(naccesses > 0); + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || is_user_access(caller_access.first)) { + statistics += ","; + statistics += caller_to_string(caller_access.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_access.second, naccesses)); + } + } + uint64_t ref_keys_accesses = 0; + uint64_t ref_keys_does_not_exist_accesses = 0; + for (auto const& ref_key_caller_access : block->key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || is_user_access(caller_access.first)) { + ref_keys_accesses += caller_access.second; + } + } + } + for (auto const& ref_key_caller_access : + block->non_exist_key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || is_user_access(caller_access.first)) { + ref_keys_does_not_exist_accesses += caller_access.second; + } + } + } + statistics += ",nkeys="; + statistics += std::to_string(block->num_keys); + statistics += ",block_size="; + statistics += std::to_string(block->block_size); + statistics += ",num_ref_keys="; + statistics += std::to_string(block->key_num_access_map.size()); + statistics += ",percent_access_ref_keys="; + 
statistics += std::to_string(percent(ref_keys_accesses, naccesses)); + statistics += ",num_ref_keys_does_not_exist="; + statistics += std::to_string(block->non_exist_key_num_access_map.size()); + statistics += ",percent_access_ref_keys_does_not_exist="; + statistics += + std::to_string(percent(ref_keys_does_not_exist_accesses, naccesses)); + statistics += ",ref_data_size="; + statistics += std::to_string(block->referenced_data_size); + fprintf(stdout, + "Top %" PRIu32 " access count blocks access_count=%" PRIu64 + " %s\n", + top_k, naccess_it->first, statistics.c_str()); + } + } + + for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, "Break down by block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Break down by column family %s and block type " + "%s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { + HistogramStat existing_keys_stats; + std::map<std::string, HistogramStat> cf_existing_keys_stats_map; + HistogramStat non_existing_keys_stats; + std::map<std::string, HistogramStat> cf_non_existing_keys_stats_map; + HistogramStat block_access_stats; + std::map<std::string, HistogramStat> cf_block_access_info; + HistogramStat percent_referenced_bytes; + std::map<std::string, HistogramStat> cf_percent_referenced_bytes; + // Total number of accesses in a data block / number of keys in a data block. 
+ HistogramStat avg_naccesses_per_key_in_a_data_block; + std::map<std::string, HistogramStat> cf_avg_naccesses_per_key_in_a_data_block; + // The standard deviation on the number of accesses of a key in a data block. + HistogramStat stdev_naccesses_per_key_in_a_data_block; + std::map<std::string, HistogramStat> + cf_stdev_naccesses_per_key_in_a_data_block; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType /*type*/, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + // Use four decimal points. + uint64_t percent_referenced_for_existing_keys = + (uint64_t)(((double)block.key_num_access_map.size() / + (double)block.num_keys) * + 10000.0); + uint64_t percent_referenced_for_non_existing_keys = + (uint64_t)(((double)block.non_exist_key_num_access_map.size() / + (double)block.num_keys) * + 10000.0); + uint64_t percent_accesses_for_existing_keys = + (uint64_t)(((double)block.num_referenced_key_exist_in_block / + (double)block.num_accesses) * + 10000.0); + + HistogramStat hist_naccess_per_key; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + hist_naccess_per_key.Add(caller_access.second); + } + } + uint64_t avg_accesses = + static_cast<uint64_t>(hist_naccess_per_key.Average()); + uint64_t stdev_accesses = + static_cast<uint64_t>(hist_naccess_per_key.StandardDeviation()); + avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); + cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); + stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); + cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add(stdev_accesses); + + existing_keys_stats.Add(percent_referenced_for_existing_keys); + cf_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_existing_keys); + non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys); + 
cf_non_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_non_existing_keys); + block_access_stats.Add(percent_accesses_for_existing_keys); + cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys); + }; + TraverseBlocks(block_callback); + fprintf(stdout, + "Histogram on the number of referenced keys existing in a block over " + "the total number of keys in a block: \n%s", + existing_keys_stats.ToString().c_str()); + for (auto const& cf_stats : cf_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + print_break_lines(/*num_break_lines=*/1); + fprintf( + stdout, + "Histogram on the number of referenced keys DO NOT exist in a block over " + "the total number of keys in a block: \n%s", + non_existing_keys_stats.ToString().c_str()); + for (auto const& cf_stats : cf_non_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Histogram on the number of accesses on keys exist in a block over " + "the total number of accesses in a block: \n%s", + block_access_stats.ToString().c_str()); + for (auto const& cf_stats : cf_block_access_info) { + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + print_break_lines(/*num_break_lines=*/1); + fprintf( + stdout, + "Histogram on the average number of accesses per key in a block: \n%s", + avg_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_avg_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + 
print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Histogram on the standard deviation of the number of accesses per " + "key in a block: \n%s", + stdev_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_stdev_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } +} + +void BlockCacheTraceAnalyzer::PrintStatsSummary() const { + uint64_t total_num_files = 0; + uint64_t total_num_blocks = 0; + uint64_t total_num_accesses = 0; + std::map<TraceType, uint64_t> bt_num_blocks_map; + std::map<TableReaderCaller, uint64_t> caller_num_access_map; + std::map<TableReaderCaller, std::map<TraceType, uint64_t>> + caller_bt_num_access_map; + std::map<TableReaderCaller, std::map<uint32_t, uint64_t>> + caller_level_num_access_map; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + uint64_t cf_num_files = 0; + uint64_t cf_num_blocks = 0; + std::map<TraceType, uint64_t> cf_bt_blocks; + uint64_t cf_num_accesses = 0; + std::map<TableReaderCaller, uint64_t> cf_caller_num_accesses_map; + std::map<TableReaderCaller, std::map<uint64_t, uint64_t>> + cf_caller_level_num_accesses_map; + std::map<TableReaderCaller, std::map<uint64_t, uint64_t>> + cf_caller_file_num_accesses_map; + std::map<TableReaderCaller, std::map<TraceType, uint64_t>> + cf_caller_bt_num_accesses_map; + total_num_files += cf_aggregates.second.fd_aggregates_map.size(); + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + cf_num_files++; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. 
+ const TraceType type = block_type_aggregates.first; + cf_bt_blocks[type] += + block_type_aggregates.second.block_access_info_map.size(); + total_num_blocks += + block_type_aggregates.second.block_access_info_map.size(); + bt_num_blocks_map[type] += + block_type_aggregates.second.block_access_info_map.size(); + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + cf_num_blocks++; + for (auto const& stats : + block_access_info.second.caller_num_access_map) { + // Stats per caller. + const TableReaderCaller caller = stats.first; + const uint64_t num_accesses = stats.second; + // Overall stats. + total_num_accesses += num_accesses; + caller_num_access_map[caller] += num_accesses; + caller_bt_num_access_map[caller][type] += num_accesses; + caller_level_num_access_map[caller][level] += num_accesses; + // Column Family stats. + cf_num_accesses += num_accesses; + cf_caller_num_accesses_map[caller] += num_accesses; + cf_caller_level_num_accesses_map[caller][level] += num_accesses; + cf_caller_file_num_accesses_map[caller][fd] += num_accesses; + cf_caller_bt_num_accesses_map[caller][type] += num_accesses; + } + } + } + } + + // Print stats. 
+ print_break_lines(/*num_break_lines=*/3); + fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str()); + fprintf(stdout, + " Number of files:%" PRIu64 " Number of blocks: %" PRIu64 + " Number of accesses: %" PRIu64 "\n", + cf_num_files, cf_num_blocks, cf_num_accesses); + for (auto block_type : cf_bt_blocks) { + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, cf_num_blocks)); + } + for (auto caller : cf_caller_num_accesses_map) { + const uint64_t naccesses = caller.second; + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, cf_num_accesses)); + fprintf(stdout, "Caller %s: Number of accesses per level break down\n", + caller_to_string(caller.first).c_str()); + for (auto naccess_level : + cf_caller_level_num_accesses_map[caller.first]) { + fprintf(stdout, + "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); + } + fprintf(stdout, "Caller %s: Number of accesses per file break down\n", + caller_to_string(caller.first).c_str()); + for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) { + fprintf(stdout, + "\t File %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_file.first, naccess_file.second, + percent(naccess_file.second, naccesses)); + } + fprintf(stdout, + "Caller %s: Number of accesses per block type break down\n", + caller_to_string(caller.first).c_str()); + for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) { + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", + block_type_to_string(naccess_type.first).c_str(), + naccess_type.second, percent(naccess_type.second, naccesses)); + 
} + } + } + print_break_lines(/*num_break_lines=*/3); + fprintf(stdout, "Overall statistics:\n"); + fprintf(stdout, + "Number of files: %" PRIu64 " Number of blocks: %" PRIu64 + " Number of accesses: %" PRIu64 "\n", + total_num_files, total_num_blocks, total_num_accesses); + for (auto block_type : bt_num_blocks_map) { + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, total_num_blocks)); + } + for (auto caller : caller_num_access_map) { + print_break_lines(/*num_break_lines=*/1); + uint64_t naccesses = caller.second; + fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, total_num_accesses)); + fprintf(stdout, "Caller %s: Number of accesses per level break down\n", + caller_to_string(caller.first).c_str()); + for (auto naccess_level : caller_level_num_access_map[caller.first]) { + fprintf(stdout, + "\t Level %d: Number of accesses: %" PRIu64 " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); + } + fprintf(stdout, "Caller %s: Number of accesses per block type break down\n", + caller_to_string(caller.first).c_str()); + for (auto naccess_type : caller_bt_num_access_map[caller.first]) { + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", + block_type_to_string(naccess_type.first).c_str(), + naccess_type.second, percent(naccess_type.second, naccesses)); + } + } +} + +std::vector<CacheConfiguration> parse_cache_config_file( + const std::string& config_path) { + std::ifstream file(config_path); + if (!file.is_open()) { + return {}; + } + std::vector<CacheConfiguration> configs; + std::string line; + while (getline(file, line)) { + CacheConfiguration cache_config; + std::stringstream ss(line); + std::vector<std::string> config_strs; + while (ss.good()) { + 
      std::string substr;
      getline(ss, substr, ',');
      config_strs.push_back(substr);
    }
    // Sanity checks.
    // Expected line format:
    //   cache_name,num_shard_bits,ghost_cache_capacity,capacity1[,capacity2...]
    if (config_strs.size() < 4) {
      fprintf(stderr, "Invalid cache simulator configuration %s\n",
              line.c_str());
      exit(1);
    }
    // Wrap the candidate name in spaces so only whole names match inside
    // kSupportedCacheNames (which embeds names between spaces).
    if (kSupportedCacheNames.find(" " + config_strs[0] + " ") ==
        std::string::npos) {
      fprintf(stderr, "Invalid cache name %s. Supported cache names are %s\n",
              line.c_str(), kSupportedCacheNames.c_str());
      exit(1);
    }
    cache_config.cache_name = config_strs[0];
    cache_config.num_shard_bits = ParseUint32(config_strs[1]);
    cache_config.ghost_cache_capacity = ParseUint64(config_strs[2]);
    // All remaining fields are cache capacities; zero (or unparsable) values
    // are rejected.
    for (uint32_t i = 3; i < config_strs.size(); i++) {
      uint64_t capacity = ParseUint64(config_strs[i]);
      if (capacity == 0) {
        fprintf(stderr, "Invalid cache capacity %s, %s\n",
                config_strs[i].c_str(), line.c_str());
        exit(1);
      }
      cache_config.cache_capacities.push_back(capacity);
    }
    configs.push_back(cache_config);
  }
  file.close();
  return configs;
}

// Parses a comma-separated list of uint64 bucket upper bounds and appends
// uint64 max as a final catch-all bucket, so that map::upper_bound lookups
// against the buckets can never return end().
std::vector<uint64_t> parse_buckets(const std::string& bucket_str) {
  std::vector<uint64_t> buckets;
  std::stringstream ss(bucket_str);
  while (ss.good()) {
    std::string bucket;
    getline(ss, bucket, ',');
    buckets.push_back(ParseUint64(bucket));
  }
  buckets.push_back(std::numeric_limits<uint64_t>::max());
  return buckets;
}

// Command-line entry point: validates flags, optionally builds the cache
// simulator from the config file, runs the analysis, and emits every report
// selected by the flags.
int block_cache_trace_analyzer_tool(int argc, char** argv) {
  ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_block_cache_trace_path.empty()) {
    fprintf(stderr, "block cache trace path is empty\n");
    exit(1);
  }
  // Negative flag values are normalized to 0 (feature disabled).
  uint64_t warmup_seconds =
      FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0;
  uint32_t downsample_ratio = FLAGS_block_cache_trace_downsample_ratio > 0
                                  ?
FLAGS_block_cache_trace_downsample_ratio + : 0; + std::vector<CacheConfiguration> cache_configs = + parse_cache_config_file(FLAGS_block_cache_sim_config_path); + std::unique_ptr<BlockCacheTraceSimulator> cache_simulator; + if (!cache_configs.empty()) { + cache_simulator.reset(new BlockCacheTraceSimulator( + warmup_seconds, downsample_ratio, cache_configs)); + Status s = cache_simulator->InitializeCaches(); + if (!s.ok()) { + fprintf(stderr, "Cannot initialize cache simulators %s\n", + s.ToString().c_str()); + exit(1); + } + } + BlockCacheTraceAnalyzer analyzer( + FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, + FLAGS_human_readable_trace_file_path, + !FLAGS_reuse_distance_labels.empty(), FLAGS_mrc_only, + FLAGS_is_block_cache_human_readable_trace, std::move(cache_simulator)); + Status s = analyzer.Analyze(); + if (!s.IsIncomplete() && !s.ok()) { + // Read all traces. + fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); + exit(1); + } + fprintf(stdout, "Status: %s\n", s.ToString().c_str()); + analyzer.WriteMissRatioCurves(); + analyzer.WriteMissRatioTimeline(1); + analyzer.WriteMissRatioTimeline(kSecondInMinute); + analyzer.WriteMissRatioTimeline(kSecondInHour); + analyzer.WriteMissTimeline(1); + analyzer.WriteMissTimeline(kSecondInMinute); + analyzer.WriteMissTimeline(kSecondInHour); + + if (FLAGS_mrc_only) { + fprintf(stdout, + "Skipping the analysis statistics since the user wants to compute " + "MRC only"); + return 0; + } + + analyzer.PrintStatsSummary(); + if (FLAGS_print_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats( + /*user_access_only=*/false, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats( + /*user_access_only=*/true, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); + } + if (FLAGS_print_block_size_stats) { 
+ print_break_lines(/*num_break_lines=*/3); + analyzer.PrintBlockSizeStats(); + } + if (FLAGS_print_data_block_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintDataBlockAccessStats(); + } + print_break_lines(/*num_break_lines=*/3); + + if (!FLAGS_timeline_labels.empty()) { + std::stringstream ss(FLAGS_timeline_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + if (label.find("block") != std::string::npos) { + analyzer.WriteAccessTimeline(label, kSecondInMinute, true); + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, true); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); + } else { + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); + } + } + } + + if (!FLAGS_analyze_callers.empty()) { + analyzer.WritePercentAccessSummaryStats(); + std::stringstream ss(FLAGS_analyze_callers); + while (ss.good()) { + std::string caller; + getline(ss, caller, ','); + analyzer.WriteDetailedPercentAccessSummaryStats(string_to_caller(caller)); + } + } + + if (!FLAGS_access_count_buckets.empty()) { + std::vector<uint64_t> buckets = parse_buckets(FLAGS_access_count_buckets); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/true); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/false); + } + + if (!FLAGS_reuse_distance_labels.empty() && + !FLAGS_reuse_distance_buckets.empty()) { + std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::stringstream ss(FLAGS_reuse_distance_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseDistance(label, buckets); + } + } + + if (!FLAGS_reuse_interval_labels.empty() && + !FLAGS_reuse_interval_buckets.empty()) { + std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::stringstream 
ss(FLAGS_reuse_interval_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseInterval(label, buckets); + } + } + + if (!FLAGS_reuse_lifetime_labels.empty() && + !FLAGS_reuse_lifetime_buckets.empty()) { + std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_lifetime_buckets); + std::stringstream ss(FLAGS_reuse_lifetime_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseLifetime(label, buckets); + } + } + + if (FLAGS_analyze_blocks_reuse_k_reuse_window != 0) { + std::vector<TraceType> block_types{TraceType::kBlockTraceIndexBlock, + TraceType::kBlockTraceDataBlock, + TraceType::kBlockTraceFilterBlock}; + for (auto block_type : block_types) { + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/true, block_type); + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/false, block_type); + } + } + + if (!FLAGS_analyze_get_spatial_locality_labels.empty() && + !FLAGS_analyze_get_spatial_locality_buckets.empty()) { + std::vector<uint64_t> buckets = + parse_buckets(FLAGS_analyze_get_spatial_locality_buckets); + std::stringstream ss(FLAGS_analyze_get_spatial_locality_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteGetSpatialLocality(label, buckets); + } + } + + if (!FLAGS_analyze_correlation_coefficients_labels.empty()) { + std::stringstream ss(FLAGS_analyze_correlation_coefficients_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteCorrelationFeatures( + label, FLAGS_analyze_correlation_coefficients_max_number_of_values); + } + analyzer.WriteCorrelationFeaturesForGet( + FLAGS_analyze_correlation_coefficients_max_number_of_values); + } + + if (!FLAGS_skew_labels.empty() && !FLAGS_skew_buckets.empty()) { + std::vector<uint64_t> buckets = parse_buckets(FLAGS_skew_buckets); + 
std::stringstream ss(FLAGS_skew_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + if (label.find("block") != std::string::npos) { + analyzer.WriteSkewness(label, buckets, + TraceType::kBlockTraceIndexBlock); + analyzer.WriteSkewness(label, buckets, + TraceType::kBlockTraceFilterBlock); + analyzer.WriteSkewness(label, buckets, TraceType::kBlockTraceDataBlock); + analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax); + } else { + analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax); + } + } + } + return 0; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h new file mode 100644 index 000000000..2f1ebd139 --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h @@ -0,0 +1,397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <map> +#include <set> +#include <vector> + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/utilities/sim_cache.h" +#include "trace_replay/block_cache_tracer.h" +#include "utilities/simulator_cache/cache_simulator.h" + +namespace ROCKSDB_NAMESPACE { + +// Statistics of a key refereneced by a Get. +struct GetKeyInfo { + uint64_t key_id = 0; + std::vector<uint64_t> access_sequence_number_timeline; + std::vector<uint64_t> access_timeline; + + void AddAccess(const BlockCacheTraceRecord& access, + uint64_t access_sequnce_number) { + access_sequence_number_timeline.push_back(access_sequnce_number); + access_timeline.push_back(access.access_timestamp); + } +}; + +// Statistics of a block. 
// Accumulated statistics for a single block observed in the trace.
// Populated incrementally by AddAccess(), one call per trace record.
struct BlockAccessInfo {
  uint64_t block_id = 0;           // Unique id assigned by the analyzer.
  uint64_t table_id = 0;           // SST table id derived from the access record.
  uint64_t block_offset = 0;       // Offset of the block within its SST file.
  uint64_t num_accesses = 0;       // Total number of accesses seen so far.
  uint64_t block_size = 0;         // Block size in bytes (0 until first known).
  uint64_t first_access_time = 0;  // Timestamp (microseconds) of first access.
  uint64_t last_access_time = 0;   // Timestamp (microseconds) of latest access.
  uint64_t num_keys = 0;           // Number of keys stored in this block.
  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
      key_num_access_map;  // for keys that exist in this block.
  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
      non_exist_key_num_access_map;  // for keys that do not exist in this block.
  uint64_t num_referenced_key_exist_in_block = 0;
  // Total bytes of referenced data, counted once per distinct referenced key.
  uint64_t referenced_data_size = 0;
  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
  // caller:timestamp:number_of_accesses. The granularity of the timestamp is
  // seconds.
  std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
      caller_num_accesses_timeline;
  // Unique blocks since the last access.
  std::set<std::string> unique_blocks_since_last_access;
  // Number of reuses grouped by reuse distance.
  std::map<uint64_t, uint64_t> reuse_distance_count;

  // The access sequence numbers of this block.
  std::vector<uint64_t> access_sequence_number_timeline;
  // NOTE: member name keeps the historical double underscore; it is referenced
  // elsewhere in the analyzer, so renaming would break callers.
  std::map<TableReaderCaller, std::vector<uint64_t>>
      caller_access_sequence__number_timeline;
  // The access timestamp in microseconds of this block.
  std::vector<uint64_t> access_timeline;
  std::map<TableReaderCaller, std::vector<uint64_t>> caller_access_timeline;

  // Fold one trace record into this block's statistics.
  // 'access_sequnce_number' (sic) is the analyzer-wide ordinal of the record.
  void AddAccess(const BlockCacheTraceRecord& access,
                 uint64_t access_sequnce_number) {
    // Sanity checks: size/key-count must be stable across accesses once known.
    if (block_size != 0 && access.block_size != 0) {
      assert(block_size == access.block_size);
    }
    if (num_keys != 0 && access.num_keys_in_block != 0) {
      assert(num_keys == access.num_keys_in_block);
    }
    if (first_access_time == 0) {
      first_access_time = access.access_timestamp;
    }
    table_id = BlockCacheTraceHelper::GetTableId(access);
    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
    last_access_time = access.access_timestamp;
    block_size = access.block_size;
    caller_num_access_map[access.caller]++;
    num_accesses++;
    // access.access_timestamp is in microsecond.
    const uint64_t timestamp_in_seconds =
        access.access_timestamp / kMicrosInSecond;
    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
    // Populate the feature vectors.
    access_sequence_number_timeline.push_back(access_sequnce_number);
    caller_access_sequence__number_timeline[access.caller].push_back(
        access_sequnce_number);
    access_timeline.push_back(access.access_timestamp);
    caller_access_timeline[access.caller].push_back(access.access_timestamp);
    // Key-level statistics only apply to data-block accesses made by
    // Get/MultiGet, which carry a referenced key.
    if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type,
                                                          access.caller)) {
      num_keys = access.num_keys_in_block;
      if (access.referenced_key_exist_in_block) {
        // Count referenced_data_size only on the first access of this key.
        if (key_num_access_map.find(access.referenced_key) ==
            key_num_access_map.end()) {
          referenced_data_size += access.referenced_data_size;
        }
        key_num_access_map[access.referenced_key][access.caller]++;
        num_referenced_key_exist_in_block++;
        // Referenced data exceeding the block size indicates a malformed key;
        // parse it to trigger the assertion in debug builds.
        if (referenced_data_size > block_size && block_size != 0) {
          ParsedInternalKey internal_key;
          Status s = ParseInternalKey(access.referenced_key, &internal_key,
                                      false /* log_err_key */);  // TODO
          assert(s.ok());  // TODO
        }
      } else {
        non_exist_key_num_access_map[access.referenced_key][access.caller]++;
      }
    }
  }
};

// Aggregates stats of a block given a block type.
struct BlockTypeAccessInfoAggregate {
  std::map<std::string, BlockAccessInfo> block_access_info_map;
};

// Aggregates BlockTypeAccessInfoAggregate given an SST file.
struct SSTFileAccessInfoAggregate {
  uint32_t level;  // LSM level of the SST file.
  std::map<TraceType, BlockTypeAccessInfoAggregate> block_type_aggregates_map;
};

// Aggregates SSTFileAccessInfoAggregate given a column family.
struct ColumnFamilyAccessInfoAggregate {
  // Keyed by SST file number (fd).
  std::map<uint64_t, SSTFileAccessInfoAggregate> fd_aggregates_map;
};

// Per-access feature vectors used for correlation analysis; the i-th entry of
// each vector describes the i-th access of a block or key.
struct Features {
  std::vector<uint64_t> elapsed_time_since_last_access;
  std::vector<uint64_t> num_accesses_since_last_access;
  std::vector<uint64_t> num_past_accesses;
};

// Per-access "future" values paired with Features for correlation analysis.
struct Predictions {
  std::vector<uint64_t> elapsed_time_till_next_access;
  std::vector<uint64_t> num_accesses_till_next_access;
};

class BlockCacheTraceAnalyzer {
 public:
  BlockCacheTraceAnalyzer(
      const std::string& trace_file_path, const std::string& output_dir,
      const std::string& human_readable_trace_file_path,
      bool compute_reuse_distance, bool mrc_only,
      bool is_human_readable_trace_file,
      std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
  ~BlockCacheTraceAnalyzer() = default;
  // No copy and move.
  BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
  BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
  BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
  BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;

  // Read all access records in the given trace_file, maintains the stats of
  // a block, and aggregates the information by block type, sst file, and column
  // family. Subsequently, the caller may call Print* functions to print
  // statistics.
  Status Analyze();

  // Print a summary of statistics of the trace, e.g.,
  // Number of files: 2 Number of blocks: 50 Number of accesses: 50
  // Number of Index blocks: 10
  // Number of Filter blocks: 10
  // Number of Data blocks: 10
  // Number of UncompressionDict blocks: 10
  // Number of RangeDeletion blocks: 10
  // ***************************************************************
  // Caller Get: Number of accesses 10
  // Caller Get: Number of accesses per level break down
  //          Level 0: Number of accesses: 10
  // Caller Get: Number of accesses per block type break down
  //          Block Type Index: Number of accesses: 2
  //          Block Type Filter: Number of accesses: 2
  //          Block Type Data: Number of accesses: 2
  //          Block Type UncompressionDict: Number of accesses: 2
  //          Block Type RangeDeletion: Number of accesses: 2
  void PrintStatsSummary() const;

  // Print block size distribution and the distribution break down by block type
  // and column family.
  void PrintBlockSizeStats() const;

  // Print access count distribution and the distribution break down by block
  // type and column family.
  void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
                             uint32_t top_k) const;

  // Print data block accesses by user Get and Multi-Get.
  // It prints out 1) A histogram on the percentage of keys accessed in a data
  // block break down by if a referenced key exists in the data block and the
  // histogram break down by column family. 2) A histogram on the percentage of
  // accesses on keys exist in a data block and its break down by column family.
  void PrintDataBlockAccessStats() const;

  // Write the percentage of accesses break down by column family into a csv
  // file saved in 'output_dir'.
  //
  // The file is named "percentage_of_accesses_summary". The file format is
  // caller,cf_0,cf_1,...,cf_n where the cf_i is the column family name found in
  // the trace.
  void WritePercentAccessSummaryStats() const;

  // Write the percentage of accesses for the given caller break down by column
  // family, level, and block type into a csv file saved in 'output_dir'.
  //
  // It generates two files: 1) caller_level_percentage_of_accesses_summary and
  // 2) caller_bt_percentage_of_accesses_summary which break down by the level
  // and block type, respectively. The file format is
  // level/bt,cf_0,cf_1,...,cf_n where cf_i is the column family name found in
  // the trace.
  void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;

  // Write the access count summary into a csv file saved in 'output_dir'.
  // It groups blocks by their access count.
  //
  // It generates two files: 1) cf_access_count_summary and 2)
  // bt_access_count_summary which break down the access count by column family
  // and block type, respectively. The file format is
  // cf/bt,bucket_0,bucket_1,...,bucket_N.
  void WriteAccessCountSummaryStats(
      const std::vector<uint64_t>& access_count_buckets,
      bool user_access_only) const;

  // Write miss ratio curves of simulated cache configurations into a csv file
  // named "mrc" saved in 'output_dir'.
  //
  // The file format is
  // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
  void WriteMissRatioCurves() const;

  // Write miss ratio timeline of simulated cache configurations into several
  // csv files, one per cache capacity saved in 'output_dir'.
  //
  // The file format is
  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
  // where N is the number of unique cache names
  // (cache_name+num_shard_bits+ghost_capacity).
  void WriteMissRatioTimeline(uint64_t time_unit) const;

  // Write misses timeline of simulated cache configurations into several
  // csv files, one per cache capacity saved in 'output_dir'.
  //
  // The file format is
  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
  // where N is the number of unique cache names
  // (cache_name+num_shard_bits+ghost_capacity).
  void WriteMissTimeline(uint64_t time_unit) const;

  // Write the access timeline into a csv file saved in 'output_dir'.
  //
  // The file is named "label_access_timeline". The file format is
  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
  // where N is the number of unique labels found in the trace.
  void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
                           bool user_access_only) const;

  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
  // distance is defined as the cumulated size of unique blocks read between two
  // consecutive accesses on the same block.
  //
  // The file is named "label_reuse_distance". The file format is
  // bucket,label_1,label_2,...,label_N.
  void WriteReuseDistance(const std::string& label_str,
                          const std::vector<uint64_t>& distance_buckets) const;

  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
  // interval is defined as the time between two consecutive accesses on the
  // same block.
  //
  // The file is named "label_reuse_interval". The file format is
  // bucket,label_1,label_2,...,label_N.
  void WriteReuseInterval(const std::string& label_str,
                          const std::vector<uint64_t>& time_buckets) const;

  // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
  // lifetime is defined as the time interval between the first access of a
  // block and its last access.
  //
  // The file is named "label_reuse_lifetime". The file format is
  // bucket,label_1,label_2,...,label_N.
  void WriteReuseLifetime(const std::string& label_str,
                          const std::vector<uint64_t>& time_buckets) const;

  // Write the reuse timeline into a csv file saved in 'output_dir'.
  //
  // The file is named
  // "block_type_user_access_only_reuse_window_reuse_timeline". The file format
  // is start_time,0,1,...,N where N equals trace_duration / reuse_window.
  void WriteBlockReuseTimeline(const uint64_t reuse_window,
                               bool user_access_only,
                               TraceType block_type) const;

  // Write the Get spatial locality into csv files saved in 'output_dir'.
  //
  // It generates three csv files. label_percent_ref_keys,
  // label_percent_accesses_on_ref_keys, and
  // label_percent_data_size_on_ref_keys.
  void WriteGetSpatialLocality(
      const std::string& label_str,
      const std::vector<uint64_t>& percent_buckets) const;

  // Write per-block feature/prediction correlation samples, grouped by label.
  void WriteCorrelationFeatures(const std::string& label_str,
                                uint32_t max_number_of_values) const;

  // Same as WriteCorrelationFeatures but for keys accessed by Get.
  void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const;

  // Write access-count skewness distribution for the given block type.
  void WriteSkewness(const std::string& label_str,
                     const std::vector<uint64_t>& percent_buckets,
                     TraceType target_block_type) const;

  // Test-only accessor for the aggregated per-column-family statistics.
  const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
  TEST_cf_aggregates_map() const {
    return cf_aggregates_map_;
  }

 private:
  std::set<std::string> ParseLabelStr(const std::string& label_str) const;

  std::string BuildLabel(const std::set<std::string>& labels,
                         const std::string& cf_name, uint64_t fd,
                         uint32_t level, TraceType type,
                         TableReaderCaller caller, uint64_t block_key,
                         const BlockAccessInfo& block) const;

  void ComputeReuseDistance(BlockAccessInfo* info) const;

  Status RecordAccess(const BlockCacheTraceRecord& access);

  void UpdateReuseIntervalStats(
      const std::string& label, const std::vector<uint64_t>& time_buckets,
      const std::map<uint64_t, uint64_t> timeline,
      std::map<std::string, std::map<uint64_t, uint64_t>>*
          label_time_num_reuses,
      uint64_t* total_num_reuses) const;

  std::string OutputPercentAccessStats(
      uint64_t total_accesses,
      const std::map<std::string, uint64_t>& cf_access_count) const;

  void WriteStatsToFile(
      const std::string& label_str, const std::vector<uint64_t>& time_buckets,
      const std::string& filename_suffix,
      const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data,
      uint64_t ntotal) const;

  void TraverseBlocks(
      std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
                         uint32_t /*level*/, TraceType /*block_type*/,
                         const std::string& /*block_key*/,
                         uint64_t /*block_key_id*/,
                         const BlockAccessInfo& /*block_access_info*/)>
          block_callback,
      std::set<std::string>* labels = nullptr) const;

  void UpdateFeatureVectors(
      const std::vector<uint64_t>& access_sequence_number_timeline,
      const std::vector<uint64_t>& access_timeline, const std::string& label,
      std::map<std::string, Features>* label_features,
      std::map<std::string, Predictions>* label_predictions) const;

  void WriteCorrelationFeaturesToFile(
      const std::string& label,
      const std::map<std::string, Features>& label_features,
      const std::map<std::string, Predictions>& label_predictions,
      uint32_t max_number_of_values) const;

  ROCKSDB_NAMESPACE::Env* env_;
  const std::string trace_file_path_;
  const std::string output_dir_;
  std::string human_readable_trace_file_path_;
  const bool compute_reuse_distance_;
  const bool mrc_only_;
  const bool is_human_readable_trace_file_;

  BlockCacheTraceHeader header_;
  std::unique_ptr<BlockCacheTraceSimulator> cache_simulator_;
  // Aggregated stats keyed by column family name.
  std::map<std::string, ColumnFamilyAccessInfoAggregate> cf_aggregates_map_;
  // Flat index from block key into the aggregates above (non-owning pointers).
  std::map<std::string, BlockAccessInfo*> block_info_map_;
  std::unordered_map<std::string, GetKeyInfo> get_key_info_map_;
  uint64_t access_sequence_number_ = 0;
  uint64_t trace_start_timestamp_in_seconds_ = 0;
  uint64_t trace_end_timestamp_in_seconds_ = 0;
  MissRatioStats miss_ratio_stats_;
  uint64_t unique_block_id_ = 1;
  uint64_t unique_get_key_id_ = 1;
  BlockCacheHumanReadableTraceWriter human_readable_trace_writer_;
};

int block_cache_trace_analyzer_tool(int argc, char** argv);

}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
new file mode 100644
index 000000000..37166bcb4
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
@@ -0,0 +1,729 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).

#!/usr/bin/env python3

import csv
import math
import os
import random
import sys

import matplotlib

# Select a non-interactive backend before pyplot is imported, so the script
# can run headless (writes PDFs only, never opens a window).
matplotlib.use("Agg")
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Make sure a legend has the same color across all generated graphs.
def get_cmap(n, name="hsv"):
    """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name."""
    # NOTE(review): plt.cm.get_cmap is deprecated in matplotlib >= 3.7
    # (use matplotlib.colormaps[...].resampled(n)) — confirm target version.
    return plt.cm.get_cmap(name, n)


# Module-level color state shared by all plotting helpers: each label seen in
# any chart gets one fixed color from the shuffled palette, so the same label
# is colored identically across every generated graph.
color_index = 0
bar_color_maps = {}
colors = []
n_colors = 360
linear_colors = get_cmap(n_colors)
for i in range(n_colors):
    colors.append(linear_colors(i))
# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
random.shuffle(colors)


def num_to_gb(n):
    """Format a byte count as gigabytes: whole GB without decimals (but note
    the true division below yields e.g. "2.0"), otherwise two decimal places."""
    one_gb = 1024 * 1024 * 1024
    if float(n) % one_gb == 0:
        # n / one_gb is true division in Python 3, so an exact multiple
        # renders as e.g. "2.0" rather than "2".
        return "{}".format(n / one_gb)
    # Keep two decimal points.
    return "{0:.2f}".format(float(n) / one_gb)


def plot_miss_stats_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot one miss-ratio-curve line per cache configuration, read from every
    CSV in csv_result_dir matching file_prefix/file_suffix, into a single PDF.

    Expected CSV row format:
    cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio[,...].
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                # One curve per cache configuration (name, shards, ghost size).
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {}
                    miss_ratios[config]["x"] = []
                    miss_ratios[config]["y"] = []
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
    fig = plt.figure()
    for config in miss_ratios:
        plt.plot(
            miss_ratios[config]["x"], miss_ratios[config]["y"], label=config
        )
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    # NOTE(review): the 'basex' keyword was deprecated in matplotlib 3.3 and
    # removed in 3.5 (now 'base'); likewise 'ymin' for ylim (now 'bottom').
    # This code requires an older matplotlib — confirm the pinned version.
    plt.xscale("log", basex=2)
    plt.ylim(ymin=0)
    # NOTE(review): 'file' here is the loop variable after the loop ended, so
    # the title shows only the last file processed — presumably intentional
    # since all files share one figure; verify.
    plt.title("{}".format(file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )


def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Like plot_miss_stats_graphs, but plot each configuration's miss-ratio
    delta relative to the baseline "lru-0-0" configuration at each matching
    cache capacity. Silently returns if no lru-0-0 baseline was found.
    """
    miss_ratios = {}
    for file in os.listdir(csv_result_dir):
        if not file.startswith(file_prefix):
            continue
        if not file.endswith(file_suffix):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        mrc_file_path = csv_result_dir + "/" + file
        with open(mrc_file_path, "r") as csvfile:
            rows = csv.reader(csvfile, delimiter=",")
            for row in rows:
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                if config not in miss_ratios:
                    miss_ratios[config] = {}
                    miss_ratios[config]["x"] = []
                    miss_ratios[config]["y"] = []
                miss_ratios[config]["x"].append(capacity)
                miss_ratios[config]["y"].append(miss_ratio)
    if "lru-0-0" not in miss_ratios:
        return
    fig = plt.figure()
    for config in miss_ratios:
        # diffs[i] = config's miss ratio minus the LRU baseline's, matched by
        # equal cache capacity; capacities missing from config stay 0.
        diffs = [0] * len(miss_ratios["lru-0-0"]["x"])
        for i in range(len(miss_ratios["lru-0-0"]["x"])):
            for j in range(len(miss_ratios[config]["x"])):
                if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]:
                    diffs[i] = (
                        miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i]
                    )
                    break
        plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    # NOTE(review): 'basex' removed in matplotlib >= 3.5 (use base=2) — see
    # the matching note in plot_miss_stats_graphs.
    plt.xscale("log", basex=2)
    # NOTE(review): as above, 'file' is the last file from the scan loop.
    plt.title("{}".format(file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )


def sanitize(label):
    # matplotlib cannot plot legends that is prefixed with "_"
    # so we need to remove them here.
    index = 0
    for i in range(len(label)):
        if label[i] == "_":
            index += 1
        else:
            break
    data = label[index:]
    # The value of uint64_max in c++.
    if "18446744073709551615" in data:
        return "max"
    return data


# Read the csv file vertically, i.e., group the data by columns.
def read_data_for_plot_vertical(csvfile):
    """Parse an open CSV file whose first row is a header and first column is
    the x-axis. Returns (x, labels, label_stats) where labels come from the
    header row and label_stats[i] is the list of float values for labels[i].
    """
    x = []
    labels = []
    label_stats = {}
    csv_rows = csv.reader(csvfile, delimiter=",")
    data_rows = []
    for row in csv_rows:
        data_rows.append(row)
    # header
    for i in range(1, len(data_rows[0])):
        labels.append(sanitize(data_rows[0][i]))
        label_stats[i - 1] = []
    for i in range(1, len(data_rows)):
        for j in range(len(data_rows[i])):
            if j == 0:
                x.append(sanitize(data_rows[i][j]))
                continue
            label_stats[j - 1].append(float(data_rows[i][j]))
    return x, labels, label_stats


# Read the csv file horizontally, i.e., group the data by rows.
+def read_data_for_plot_horizontal(csvfile): + x = [] + labels = [] + label_stats = {} + csv_rows = csv.reader(csvfile, delimiter=",") + data_rows = [] + for row in csv_rows: + data_rows.append(row) + # header + for i in range(1, len(data_rows)): + labels.append(sanitize(data_rows[i][0])) + label_stats[i - 1] = [] + for i in range(1, len(data_rows[0])): + x.append(sanitize(data_rows[0][i])) + for i in range(1, len(data_rows)): + for j in range(len(data_rows[i])): + if j == 0: + # label + continue + label_stats[i - 1].append(float(data_rows[i][j])) + return x, labels, label_stats + + +def read_data_for_plot(csvfile, vertical): + if vertical: + return read_data_for_plot_vertical(csvfile) + return read_data_for_plot_horizontal(csvfile) + + +def plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + legend, +): + global color_index, bar_color_maps, colors + pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + if not file.startswith(filename_prefix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) + with open(csv_result_dir + "/" + file, "r") as csvfile: + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(labels) == 0: + continue + # plot figure + fig = plt.figure() + for label_index in label_stats: + # Assign a unique color to this label. + if labels[label_index] not in bar_color_maps: + bar_color_maps[labels[label_index]] = colors[color_index] + color_index += 1 + plt.plot( + [int(x[i]) for i in range(len(x) - 1)], + label_stats[label_index][:-1], + label=labels[label_index], + color=bar_color_maps[labels[label_index]], + ) + + # Translate time unit into x labels. 
+ if "_60" in file: + plt.xlabel("{} (Minute)".format(xlabel)) + if "_3600" in file: + plt.xlabel("{} (Hour)".format(xlabel)) + plt.ylabel(ylabel) + plt.title("{} {}".format(title, file)) + if legend: + plt.legend() + pdf.savefig(fig) + pdf.close() + + +def plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + x_prefix, +): + global color_index, bar_color_maps, colors + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + with open(csv_result_dir + "/" + file, "r") as csvfile: + print("Processing file {}/{}".format(csv_result_dir, file)) + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(label_stats) == 0: + continue + # Plot figure + fig = plt.figure() + ind = np.arange(len(x)) # the x locations for the groups + width = 0.5 # the width of the bars: can also be len(x) sequence + bars = [] + bottom_bars = [] + for _i in label_stats[0]: + bottom_bars.append(0) + for i in range(0, len(label_stats)): + # Assign a unique color to this label. 
+ if labels[i] not in bar_color_maps: + bar_color_maps[labels[i]] = colors[color_index] + color_index += 1 + p = plt.bar( + ind, + label_stats[i], + width, + bottom=bottom_bars, + color=bar_color_maps[labels[i]], + ) + bars.append(p[0]) + for j in range(len(label_stats[i])): + bottom_bars[j] += label_stats[i][j] + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.xticks( + ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8 + ) + plt.legend(bars, labels) + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title): + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + corr_table = corr_table.pivot("label", "corr", "value") + fig = plt.figure() + sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2") + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="access_timeline", + pdf_name="access_time.pdf", + xlabel="Time", + ylabel="Throughput", + title="Access timeline with group by label", + vertical=False, + legend=True, + ) + + +def convert_to_0_if_nan(n): + if math.isnan(n): + return 0.0 + return n + + +def plot_correlation(csv_result_dir, output_result_dir): + # Processing the correlation input first. 
+ label_str_file = {} + for file in os.listdir(csv_result_dir): + if not file.endswith("correlation_input"): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + label_str = file.split("_")[0] + label = file[len(label_str) + 1 :] + label = label[: len(label) - len("_correlation_input")] + + output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str) + if output_file not in label_str_file: + f = open("{}/{}_correlation_output".format(csv_result_dir, label_str), "w+") + label_str_file[output_file] = f + f.write("label,corr,value\n") + f = label_str_file[output_file] + f.write( + "{},{},{}\n".format( + label, + "LA+A", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+A", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+A", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LA+T", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+T", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+T", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + for label_str in label_str_file: + 
label_str_file[label_str].close() + + plot_heatmap( + csv_result_dir, + output_result_dir, + "correlation_output", + "correlation.pdf", + "Correlation", + ) + + +def plot_reuse_graphs(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval_naccesses", + pdf_name="avg_reuse_interval_naccesses.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval", + pdf_name="avg_reuse_interval.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_reuse_interval", + pdf_name="reuse_interval.pdf", + xlabel="Seconds", + ylabel="Percentage of accesses", + title="Reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="reuse_lifetime", + pdf_name="reuse_lifetime.pdf", + xlabel="Seconds", + ylabel="Percentage of blocks", + title="Reuse lifetime", + vertical=True, + x_prefix="< ", + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="reuse_blocks_timeline", + pdf_name="reuse_blocks_timeline.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Reuse blocks timeline", + vertical=False, + legend=False, + ) + + +def plot_percentage_access_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percentage_of_accesses_summary", + pdf_name="percentage_access.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_ref_keys", + 
pdf_name="percent_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_data_size_on_ref_keys", + pdf_name="percent_data_size_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_accesses_on_ref_keys", + pdf_name="percent_accesses_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + + +def plot_access_count_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_count_summary", + pdf_name="access_count_summary.pdf", + xlabel="Access count", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="< ", + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="skewness", + pdf_name="skew.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Skewness", + vertical=True, + legend=False, + ) + + +def plot_miss_ratio_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_ratio_timeline", + pdf_name="miss_ratio_timeline.pdf", + xlabel="Time", + ylabel="Miss Ratio (%)", + title="Miss ratio timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of 
misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_timeline", + pdf_name="policy_timeline.pdf", + xlabel="Time", + ylabel="# of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_ratio_timeline", + pdf_name="policy_ratio_timeline.pdf", + xlabel="Time", + ylabel="Percentage of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Must provide two arguments: \n" + "1) The directory that saves a list of " + "directories which contain block cache trace analyzer result files. \n" + "2) the directory to save plotted graphs. \n" + ) + exit(1) + csv_result_dir = sys.argv[1] + output_result_dir = sys.argv[2] + print( + "Processing directory {} and save graphs to {}.".format( + csv_result_dir, output_result_dir + ) + ) + for csv_relative_dir in os.listdir(csv_result_dir): + csv_abs_dir = csv_result_dir + "/" + csv_relative_dir + result_dir = output_result_dir + "/" + csv_relative_dir + if not os.path.isdir(csv_abs_dir): + print("{} is not a directory".format(csv_abs_dir)) + continue + print("Processing experiment dir: {}".format(csv_relative_dir)) + if not os.path.exists(result_dir): + os.makedirs(result_dir) + plot_access_count_summary(csv_abs_dir, result_dir) + plot_timeline(csv_abs_dir, result_dir) + plot_miss_ratio_timeline(csv_result_dir, output_result_dir) + plot_correlation(csv_abs_dir, result_dir) + plot_reuse_graphs(csv_abs_dir, result_dir) + plot_percentage_access_summary(csv_abs_dir, result_dir) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc", + ) + plot_miss_stats_diff_lru_graphs( 
+ csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc_diff_lru", + ) + # The following stats are only available in pysim. + for time_unit in ["1", "60", "3600"]: + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit), + ) diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc new file mode 100644 index 000000000..c5d9b1452 --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -0,0 +1,735 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, + "Please install gflags to run block_cache_trace_analyzer_test\n"); + return 0; +} +#else + +#include <fstream> +#include <iostream> +#include <map> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" +#include "trace_replay/block_cache_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +const uint64_t kBlockSize = 1024; +const std::string kBlockKeyPrefix = "test-block-"; +const uint32_t kCFId = 0; +const uint32_t kLevel = 1; +const uint64_t kSSTStoringEvenKeys = 100; +const uint64_t kSSTStoringOddKeys = 101; +const std::string kRefKeyPrefix = "test-get-"; +const uint64_t kNumKeysInBlock = 1024; +const int kMaxArgCount = 100; +const size_t kArgBufferSize = 100000; +} // namespace + +class BlockCacheTracerTest : public testing::Test { + public: + BlockCacheTracerTest() { + test_path_ = test::PerThreadDBPath("block_cache_trace_analyzer_test"); + env_ = ROCKSDB_NAMESPACE::Env::Default(); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace"; + block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; + timeline_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_buckets_ = "1,1K,1M,1G"; + reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_interval_buckets_ = "1,10,100,1000"; + reuse_lifetime_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_lifetime_buckets_ = "1,10,100,1000"; + analyzing_callers_ = "Get,Iterator"; + access_count_buckets_ = "2,3,4,5,10"; + 
analyze_get_spatial_locality_labels_ = "all"; + analyze_get_spatial_locality_buckets_ = "10,20,30,40,50,60,70,80,90,100"; + } + + ~BlockCacheTracerTest() override { + if (getenv("KEEP_DB")) { + printf("The trace file is still at %s\n", trace_file_path_.c_str()); + return; + } + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + + TableReaderCaller GetCaller(uint32_t key_id) { + uint32_t n = key_id % 5; + switch (n) { + case 0: + return TableReaderCaller::kPrefetch; + case 1: + return TableReaderCaller::kCompaction; + case 2: + return TableReaderCaller::kUserGet; + case 3: + return TableReaderCaller::kUserMultiGet; + case 4: + return TableReaderCaller::kUserIterator; + } + // This cannot happend. + assert(false); + return TableReaderCaller::kMaxBlockCacheLookupCaller; + } + + void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id, + TraceType block_type, uint32_t nblocks) { + assert(writer); + for (uint32_t i = 0; i < nblocks; i++) { + uint32_t key_id = from_key_id + i; + uint64_t timestamp = (key_id + 1) * kMicrosInSecond; + BlockCacheTraceRecord record; + record.block_type = block_type; + record.block_size = kBlockSize + key_id; + record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.access_timestamp = timestamp; + record.cf_id = kCFId; + record.cf_name = kDefaultColumnFamilyName; + record.caller = GetCaller(key_id); + record.level = kLevel; + if (key_id % 2 == 0) { + record.sst_fd_number = kSSTStoringEvenKeys; + } else { + record.sst_fd_number = kSSTStoringOddKeys; + } + record.is_cache_hit = false; + record.no_insert = false; + // Provide these fields for all block types. + // The writer should only write these fields for data blocks and the + // caller is either GET or MGET. 
+ record.referenced_key = + kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0); + record.referenced_key_exist_in_block = true; + record.num_keys_in_block = kNumKeysInBlock; + ASSERT_OK(writer->WriteBlockAccess( + record, record.block_key, record.cf_name, record.referenced_key)); + } + } + + void AssertBlockAccessInfo( + uint32_t key_id, TraceType type, + const std::map<std::string, BlockAccessInfo>& block_access_info_map) { + auto key_id_str = kBlockKeyPrefix + std::to_string(key_id); + ASSERT_TRUE(block_access_info_map.find(key_id_str) != + block_access_info_map.end()); + auto& block_access_info = block_access_info_map.find(key_id_str)->second; + ASSERT_EQ(1, block_access_info.num_accesses); + ASSERT_EQ(kBlockSize + key_id, block_access_info.block_size); + ASSERT_GT(block_access_info.first_access_time, 0); + ASSERT_GT(block_access_info.last_access_time, 0); + ASSERT_EQ(1, block_access_info.caller_num_access_map.size()); + TableReaderCaller expected_caller = GetCaller(key_id); + ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) != + block_access_info.caller_num_access_map.end()); + ASSERT_EQ( + 1, + block_access_info.caller_num_access_map.find(expected_caller)->second); + + if ((expected_caller == TableReaderCaller::kUserGet || + expected_caller == TableReaderCaller::kUserMultiGet) && + type == TraceType::kBlockTraceDataBlock) { + ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys); + ASSERT_EQ(1, block_access_info.key_num_access_map.size()); + ASSERT_EQ(0, block_access_info.non_exist_key_num_access_map.size()); + ASSERT_EQ(1, block_access_info.num_referenced_key_exist_in_block); + } + } + + void RunBlockCacheTraceAnalyzer() { + std::vector<std::string> params = { + "./block_cache_trace_analyzer", + "-block_cache_trace_path=" + trace_file_path_, + "-block_cache_sim_config_path=" + block_cache_sim_config_path_, + "-block_cache_analysis_result_dir=" + test_path_, + "-print_block_size_stats", + "-print_access_count_stats", + 
"-print_data_block_access_count_stats", + "-cache_sim_warmup_seconds=0", + "-analyze_bottom_k_access_count_blocks=5", + "-analyze_top_k_access_count_blocks=5", + "-analyze_blocks_reuse_k_reuse_window=5", + "-timeline_labels=" + timeline_labels_, + "-reuse_distance_labels=" + reuse_distance_labels_, + "-reuse_distance_buckets=" + reuse_distance_buckets_, + "-reuse_interval_labels=" + reuse_interval_labels_, + "-reuse_interval_buckets=" + reuse_interval_buckets_, + "-reuse_lifetime_labels=" + reuse_lifetime_labels_, + "-reuse_lifetime_buckets=" + reuse_lifetime_buckets_, + "-analyze_callers=" + analyzing_callers_, + "-access_count_buckets=" + access_count_buckets_, + "-analyze_get_spatial_locality_labels=" + + analyze_get_spatial_locality_labels_, + "-analyze_get_spatial_locality_buckets=" + + analyze_get_spatial_locality_buckets_, + "-analyze_correlation_coefficients_labels=all", + "-skew_labels=all", + "-skew_buckets=10,50,100"}; + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast<int>(arg.size()) + 1; + } + ASSERT_EQ(0, + ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv)); + } + + Env* env_; + EnvOptions env_options_; + std::string block_cache_sim_config_path_; + std::string trace_file_path_; + std::string test_path_; + std::string timeline_labels_; + std::string reuse_distance_labels_; + std::string reuse_distance_buckets_; + std::string reuse_interval_labels_; + std::string reuse_interval_buckets_; + std::string reuse_lifetime_labels_; + std::string reuse_lifetime_buckets_; + std::string analyzing_callers_; + std::string access_count_buckets_; + std::string analyze_get_spatial_locality_labels_; + std::string analyze_get_spatial_locality_buckets_; +}; + 
+TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { + { + // Generate a trace file. + BlockCacheTraceWriterOptions trace_writer_opt; + std::unique_ptr<TraceWriter> trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + const auto& clock = env_->GetSystemClock(); + std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer = + NewBlockCacheTraceWriter(clock.get(), trace_writer_opt, + std::move(trace_writer)); + ASSERT_NE(block_cache_trace_writer, nullptr); + ASSERT_OK(block_cache_trace_writer->WriteHeader()); + WriteBlockAccess(block_cache_trace_writer.get(), 0, + TraceType::kBlockTraceDataBlock, 50); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Generate a cache sim config. + std::string config = "lru,1,0,1K,1M,1G"; + std::ofstream out(block_cache_sim_config_path_); + ASSERT_TRUE(out.is_open()); + out << config << std::endl; + out.close(); + } + RunBlockCacheTraceAnalyzer(); + { + // Validate the cache miss ratios. + std::vector<uint64_t> expected_capacities{1024, 1024 * 1024, + 1024 * 1024 * 1024}; + const std::string mrc_path = test_path_ + "/49_50_mrc"; + std::ifstream infile(mrc_path); + uint32_t config_index = 0; + std::string line; + // Read header. + ASSERT_TRUE(getline(infile, line)); + while (getline(infile, line)) { + std::stringstream ss(line); + std::vector<std::string> result_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + result_strs.push_back(substr); + } + ASSERT_EQ(6, result_strs.size()); + ASSERT_LT(config_index, expected_capacities.size()); + ASSERT_EQ("lru", result_strs[0]); // cache_name + ASSERT_EQ("1", result_strs[1]); // num_shard_bits + ASSERT_EQ("0", result_strs[2]); // ghost_cache_capacity + ASSERT_EQ(std::to_string(expected_capacities[config_index]), + result_strs[3]); // cache_capacity + ASSERT_EQ("100.0000", result_strs[4]); // miss_ratio + ASSERT_EQ("50", result_strs[5]); // number of accesses. 
+ config_index++; + } + ASSERT_EQ(expected_capacities.size(), config_index); + infile.close(); + ASSERT_OK(env_->DeleteFile(mrc_path)); + + const std::vector<std::string> time_units{"1", "60", "3600"}; + expected_capacities.push_back(std::numeric_limits<uint64_t>::max()); + for (auto const& expected_capacity : expected_capacities) { + for (auto const& time_unit : time_units) { + const std::string miss_ratio_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_ratio_timeline"; + std::ifstream mrt_file(miss_ratio_timeline_path); + // Read header. + ASSERT_TRUE(getline(mrt_file, line)); + ASSERT_TRUE(getline(mrt_file, line)); + std::stringstream ss(line); + bool read_header = false; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (!read_header) { + if (expected_capacity == std::numeric_limits<uint64_t>::max()) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + read_header = true; + continue; + } + ASSERT_DOUBLE_EQ(100.0, ParseDouble(substr)); + } + ASSERT_FALSE(getline(mrt_file, line)); + mrt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_ratio_timeline_path)); + } + for (auto const& time_unit : time_units) { + const std::string miss_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_timeline"; + std::ifstream mt_file(miss_timeline_path); + // Read header. 
+ ASSERT_TRUE(getline(mt_file, line)); + ASSERT_TRUE(getline(mt_file, line)); + std::stringstream ss(line); + uint32_t num_misses = 0; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (num_misses == 0) { + if (expected_capacity == std::numeric_limits<uint64_t>::max()) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + num_misses++; + continue; + } + num_misses += ParseInt(substr); + } + ASSERT_EQ(51u, num_misses); + ASSERT_FALSE(getline(mt_file, line)); + mt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_timeline_path)); + } + } + } + { + // Validate the skewness csv file. + const std::string skewness_file_path = test_path_ + "/all_skewness"; + std::ifstream skew_file(skewness_file_path); + // Read header. + std::string line; + ASSERT_TRUE(getline(skew_file, line)); + std::stringstream ss(line); + double sum_percent = 0; + while (getline(skew_file, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + sum_percent += ParseDouble(substr); + } + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_FALSE(getline(skew_file, line)); + skew_file.close(); + ASSERT_OK(env_->DeleteFile(skewness_file_path)); + } + { + // Validate the timeline csv files. 
+ const std::vector<std::string> time_units{"_60", "_3600"}; + const std::vector<std::string> user_access_only_flags{"user_access_only_", + "all_access_"}; + for (auto const& user_access_only : user_access_only_flags) { + for (auto const& unit : time_units) { + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + if (l.find("block") == std::string::npos) { + if (user_access_only != "all_access_") { + continue; + } + } + const std::string timeline_file = test_path_ + "/" + + user_access_only + l + unit + + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + const uint64_t expected_naccesses = 50; + const uint64_t expected_user_accesses = 30; + ASSERT_TRUE(getline(infile, line)) << timeline_file; + uint32_t naccesses = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + naccesses += ParseUint32(substr); + } + } + if (user_access_only == "user_access_only_") { + ASSERT_EQ(expected_user_accesses, naccesses) << timeline_file; + } else { + ASSERT_EQ(expected_naccesses, naccesses) << timeline_file; + } + ASSERT_OK(env_->DeleteFile(timeline_file)); + } + } + } + } + { + // Validate the reuse_interval and reuse_distance csv files. 
+ std::map<std::string, std::string> test_reuse_csv_files; + test_reuse_csv_files["_access_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_; + test_reuse_csv_files["_reuse_lifetime"] = reuse_lifetime_labels_; + test_reuse_csv_files["_avg_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_avg_reuse_interval_naccesses"] = + reuse_interval_labels_; + for (auto const& test : test_reuse_csv_files) { + const std::string& file_suffix = test.first; + const std::string& labels = test.second; + const uint32_t expected_num_rows = 5; + std::stringstream ss(labels); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix; + std::ifstream infile(reuse_csv_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double npercentage = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + bool label_read = false; + nrows++; + while (ss_naccess.good()) { + std::string substr; + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!label_read) { + label_read = true; + continue; + } + npercentage += ParseDouble(substr); + } + } + ASSERT_EQ(expected_num_rows, nrows); + if ("_reuse_lifetime" == test.first || + "_avg_reuse_interval" == test.first || + "_avg_reuse_interval_naccesses" == test.first) { + ASSERT_EQ(100, npercentage) << reuse_csv_file; + } else { + ASSERT_LT(npercentage, 0); + } + ASSERT_OK(env_->DeleteFile(reuse_csv_file)); + } + } + } + + { + // Validate the percentage of accesses summary. 
+ const std::string percent_access_summary_file = + test_path_ + "/percentage_of_accesses_summary"; + std::ifstream infile(percent_access_summary_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + std::set<std::string> callers; + std::set<std::string> expected_callers{"Get", "MultiGet", "Iterator", + "Prefetch", "Compaction"}; + while (getline(infile, line)) { + std::stringstream caller_percent(line); + std::string caller; + ASSERT_TRUE(getline(caller_percent, caller, ',')); + std::string percent; + ASSERT_TRUE(getline(caller_percent, percent, ',')); + ASSERT_FALSE(caller_percent.good()); + callers.insert(caller); + ASSERT_EQ(20, ParseDouble(percent)); + } + ASSERT_EQ(expected_callers.size(), callers.size()); + for (auto caller : callers) { + ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end()); + } + ASSERT_OK(env_->DeleteFile(percent_access_summary_file)); + } + { + // Validate the percentage of accesses summary by analyzing callers. + std::stringstream analyzing_callers(analyzing_callers_); + while (analyzing_callers.good()) { + std::string caller; + ASSERT_TRUE(getline(analyzing_callers, caller, ',')); + std::vector<std::string> breakdowns{"level", "bt"}; + for (auto breakdown : breakdowns) { + const std::string file_name = test_path_ + "/" + caller + "_" + + breakdown + + "_percentage_of_accesses_summary"; + std::ifstream infile(file_name); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum = 0; + while (getline(infile, line)) { + std::stringstream label_percent(line); + std::string label; + ASSERT_TRUE(getline(label_percent, label, ',')); + std::string percent; + ASSERT_TRUE(getline(label_percent, percent, ',')); + ASSERT_FALSE(label_percent.good()); + sum += ParseDouble(percent); + } + ASSERT_EQ(100, sum); + ASSERT_OK(env_->DeleteFile(file_name)); + } + } + } + const std::vector<std::string> access_types{"user_access_only", "all_access"}; + const std::vector<std::string> prefix{"bt", "cf"}; + for (auto 
const& pre : prefix) { + for (auto const& access_type : access_types) { + { + // Validate the access count summary. + const std::string bt_access_count_summary = test_path_ + "/" + pre + + "_" + access_type + + "_access_count_summary"; + std::ifstream infile(bt_access_count_summary); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(bt_access_count_summary)); + } + } + } + for (auto const& access_type : access_types) { + std::vector<std::string> block_types{"Index", "Data", "Filter"}; + for (auto block_type : block_types) { + // Validate reuse block timeline. + const std::string reuse_blocks_timeline = test_path_ + "/" + block_type + + "_" + access_type + + "_5_reuse_blocks_timeline"; + std::ifstream infile(reuse_blocks_timeline); + std::string line; + ASSERT_TRUE(getline(infile, line)) << reuse_blocks_timeline; + uint32_t index = 0; + while (getline(infile, line)) { + std::stringstream timeline(line); + bool start_time = false; + double sum = 0; + while (timeline.good()) { + std::string value; + ASSERT_TRUE(getline(timeline, value, ',')); + if (!start_time) { + start_time = true; + continue; + } + sum += ParseDouble(value); + } + index++; + ASSERT_LT(sum, 100.0 * index + 1) << reuse_blocks_timeline; + } + ASSERT_OK(env_->DeleteFile(reuse_blocks_timeline)); + } + } + + std::stringstream ss(analyze_get_spatial_locality_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::vector<std::string> spatial_locality_files{ + "_percent_ref_keys", "_percent_accesses_on_ref_keys", + "_percent_data_size_on_ref_keys"}; + for (auto const& spatial_locality_file : spatial_locality_files) { + const 
std::string filename = test_path_ + "/" + l + spatial_locality_file; + std::ifstream infile(filename); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + nrows++; + } + ASSERT_EQ(11u, nrows); + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(filename)); + } + } + ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); +} + +TEST_F(BlockCacheTracerTest, MixedBlocks) { + { + // Generate a trace file containing a mix of blocks. + // It contains two SST files with 25 blocks of odd numbered block_key in + // kSSTStoringOddKeys and 25 blocks of even numbered blocks_key in + // kSSTStoringEvenKeys. + BlockCacheTraceWriterOptions trace_writer_opt; + std::unique_ptr<TraceWriter> trace_writer; + const auto& clock = env_->GetSystemClock(); + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer = + NewBlockCacheTraceWriter(clock.get(), trace_writer_opt, + std::move(trace_writer)); + ASSERT_NE(block_cache_trace_writer, nullptr); + ASSERT_OK(block_cache_trace_writer->WriteHeader()); + // Write blocks of different types. 
+ WriteBlockAccess(block_cache_trace_writer.get(), 0, + TraceType::kBlockTraceUncompressionDictBlock, 10); + WriteBlockAccess(block_cache_trace_writer.get(), 10, + TraceType::kBlockTraceDataBlock, 10); + WriteBlockAccess(block_cache_trace_writer.get(), 20, + TraceType::kBlockTraceFilterBlock, 10); + WriteBlockAccess(block_cache_trace_writer.get(), 30, + TraceType::kBlockTraceIndexBlock, 10); + WriteBlockAccess(block_cache_trace_writer.get(), 40, + TraceType::kBlockTraceRangeDeletionBlock, 10); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + + { + // Verify trace file is generated correctly. + std::unique_ptr<TraceReader> trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(static_cast<uint32_t>(kMajorVersion), + header.rocksdb_major_version); + ASSERT_EQ(static_cast<uint32_t>(kMinorVersion), + header.rocksdb_minor_version); + // Read blocks. + BlockCacheTraceAnalyzer analyzer( + trace_file_path_, + /*output_miss_ratio_curve_path=*/"", + /*human_readable_trace_file_path=*/"", + /*compute_reuse_distance=*/true, + /*mrc_only=*/false, + /*is_block_cache_human_readable_trace=*/false, + /*simulator=*/nullptr); + // The analyzer ends when it detects an incomplete access record. 
+ ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); + const uint64_t expected_num_cfs = 1; + std::vector<uint64_t> expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys}; + const std::vector<TraceType> expected_types{ + TraceType::kBlockTraceUncompressionDictBlock, + TraceType::kBlockTraceDataBlock, TraceType::kBlockTraceFilterBlock, + TraceType::kBlockTraceIndexBlock, + TraceType::kBlockTraceRangeDeletionBlock}; + const uint64_t expected_num_keys_per_type = 5; + + auto& stats = analyzer.TEST_cf_aggregates_map(); + ASSERT_EQ(expected_num_cfs, stats.size()); + ASSERT_TRUE(stats.find(kDefaultColumnFamilyName) != stats.end()); + auto& cf_stats = stats.find(kDefaultColumnFamilyName)->second; + ASSERT_EQ(expected_fds.size(), cf_stats.fd_aggregates_map.size()); + for (auto fd_id : expected_fds) { + ASSERT_TRUE(cf_stats.fd_aggregates_map.find(fd_id) != + cf_stats.fd_aggregates_map.end()); + ASSERT_EQ(kLevel, cf_stats.fd_aggregates_map.find(fd_id)->second.level); + auto& block_type_aggregates_map = cf_stats.fd_aggregates_map.find(fd_id) + ->second.block_type_aggregates_map; + ASSERT_EQ(expected_types.size(), block_type_aggregates_map.size()); + uint32_t key_id = 0; + for (auto type : expected_types) { + ASSERT_TRUE(block_type_aggregates_map.find(type) != + block_type_aggregates_map.end()); + auto& block_access_info_map = + block_type_aggregates_map.find(type)->second.block_access_info_map; + // Each block type has 5 blocks. + ASSERT_EQ(expected_num_keys_per_type, block_access_info_map.size()); + for (uint32_t i = 0; i < 10; i++) { + // Verify that odd numbered blocks are stored in kSSTStoringOddKeys + // and even numbered blocks are stored in kSSTStoringEvenKeys. 
+ auto key_id_str = kBlockKeyPrefix + std::to_string(key_id); + if (fd_id == kSSTStoringOddKeys) { + if (key_id % 2 == 1) { + AssertBlockAccessInfo(key_id, type, block_access_info_map); + } else { + ASSERT_TRUE(block_access_info_map.find(key_id_str) == + block_access_info_map.end()); + } + } else { + if (key_id % 2 == 1) { + ASSERT_TRUE(block_access_info_map.find(key_id_str) == + block_access_info_map.end()); + } else { + AssertBlockAccessInfo(key_id, type, block_access_info_map); + } + } + key_id++; + } + } + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // GFLAG +#else +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n"); + return 0; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc new file mode 100644 index 000000000..44fec5598 --- /dev/null +++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else // GFLAGS +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" +int main(int argc, char** argv) { + return ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv); +} +#endif // GFLAGS +#else // ROCKSDB_LITE +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/check_all_python.py b/src/rocksdb/tools/check_all_python.py new file mode 100755 index 000000000..708339a67 --- /dev/null +++ b/src/rocksdb/tools/check_all_python.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import glob + +# Checks that all python files in the repository are at least free of syntax +# errors. This provides a minimal pre-/post-commit check for python file +# modifications. + +filenames = [] +# Avoid scanning all of ./ because there might be other external repos +# linked in. +for base in ["buckifier", "build_tools", "coverage", "tools"]: + # Clean this up when we finally upgrade to Python 3 + for suff in ["*", "*/*", "*/*/*"]: + filenames += glob.glob(base + "/" + suff + ".py") + +for filename in filenames: + source = open(filename, "r").read() + "\n" + # Parses and syntax checks the file, throwing on error. (No pyc written.) + _ = compile(source, filename, "exec") + +print("No syntax errors in {0} .py files".format(len(filenames))) diff --git a/src/rocksdb/tools/check_format_compatible.sh b/src/rocksdb/tools/check_format_compatible.sh new file mode 100755 index 000000000..8a3f1b379 --- /dev/null +++ b/src/rocksdb/tools/check_format_compatible.sh @@ -0,0 +1,379 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# +# A shell script to build and run different versions of ldb to check for +# expected forward and backward compatibility with "current" version. The +# working copy must have no uncommitted changes. +# +# Usage: <SCRIPT> [ref_for_current] +# `ref_for_current` can be a revision, tag, commit or branch name. Default is HEAD. +# +# Return value 0 means all regression tests pass. 1 if not pass. +# +# Environment options: +# SHORT_TEST=1 - Test only the oldest branch for each kind of test. This is +# a good choice for PR validation as it is relatively fast and will find +# most issues. +# USE_SSH=1 - Connect to GitHub with ssh instead of https + +if ! git diff-index --quiet HEAD; then + echo "You have uncommitted changes. Aborting." + exit 1 +fi + +current_checkout_name=${1:-HEAD} +# This allows the script to work even if with transient refs like "HEAD" +current_checkout_hash="$(git rev-parse --quiet --verify $current_checkout_name)" + +if [ "$current_checkout_hash" == "" ]; then + echo "Not a recognized ref: $current_checkout_name" + exit 1 +fi + +# To restore to prior branch at the end +orig_branch="$(git rev-parse --abbrev-ref HEAD)" +tmp_branch=_tmp_format_compatible +tmp_origin=_tmp_origin + +# Don't depend on what current "origin" might be +set -e +git remote remove $tmp_origin 2>/dev/null || true +if [ "$USE_SSH" ]; then + git remote add $tmp_origin "git@github.com:facebook/rocksdb.git" +else + git remote add $tmp_origin "https://github.com/facebook/rocksdb.git" +fi +git fetch $tmp_origin + +# Used in building some ancient RocksDB versions where by default it tries to +# use a precompiled libsnappy.a checked in to the repo. 
+export SNAPPY_LDFLAGS=-lsnappy + +cleanup() { + echo "== Cleaning up" + git reset --hard || true + git checkout "$orig_branch" || true + git branch -D $tmp_branch || true + git remote remove $tmp_origin || true +} +trap cleanup EXIT # Always clean up, even on failure or Ctrl+C + +scriptpath=`dirname ${BASH_SOURCE[0]}` + +test_dir=${TEST_TMPDIR:-"/tmp"}"/rocksdb_format_compatible_$USER" +rm -rf ${test_dir:?} + +# Prevent 'make clean' etc. from wiping out test_dir +export TEST_TMPDIR=$test_dir"/misc" + +# For saving current version of scripts as we checkout different versions to test +script_copy_dir=$test_dir"/script_copy" +mkdir -p $script_copy_dir +cp -f $scriptpath/*.sh $script_copy_dir + +# For shared raw input data +input_data_path=$test_dir"/test_data_input" +mkdir -p $input_data_path +# For external sst ingestion test +ext_test_dir=$test_dir"/ext" +mkdir -p $ext_test_dir +# For DB dump test +db_test_dir=$test_dir"/db" +mkdir -p $db_test_dir +# For backup/restore test (uses DB test) +bak_test_dir=$test_dir"/bak" +mkdir -p $bak_test_dir + +python_bin=$(which python3 || which python || echo python3) + +# Generate random files. +for i in {1..6} +do + input_data[$i]=$input_data_path/data$i + echo == Generating random input file ${input_data[$i]} + $python_bin - <<EOF +import random +random.seed($i) +symbols=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] +with open('${input_data[$i]}', 'w') as f: + for i in range(1,1024): + k = "" + for j in range(1, random.randint(1,32)): + k=k + symbols[random.randint(0, len(symbols) - 1)] + vb = "" + for j in range(1, random.randint(0,128)): + vb = vb + symbols[random.randint(0, len(symbols) - 1)] + v = "" + for j in range(1, random.randint(1, 5)): + v = v + vb + print(k + " ==> " + v, file=f) +EOF +done + +# Generate file(s) with sorted keys. 
+sorted_input_data=$input_data_path/sorted_data +echo == Generating file with sorted keys ${sorted_input_data} +$python_bin - <<EOF +with open('${sorted_input_data}', 'w') as f: + for i in range(0,10): + k = str(i) + v = "value" + k + print(k + " ==> " + v, file=f) +EOF + +# db_backward_only_refs defined below the rest + +# To check for DB forward compatibility with loading options (old version +# reading data from new), as well as backward compatibility +declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb") +# To check for DB forward compatibility without loading options (in addition +# to the "with loading options" set), as well as backward compatibility +declare -a db_forward_no_options_refs=() # N/A at the moment + +# To check for SST ingestion backward compatibility (new version reading +# data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to +# 5.14.x, 5.15.x) +declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb") +# To check for SST ingestion forward compatibility (old version reading +# data from new) as well as backward compatibility +declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}") + +# To check for backup backward compatibility (new version reading data +# from old) (ldb backup/restore added in 4.11.x) +declare -a bak_backward_only_refs=("4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb" "5.11.fb" "5.12.fb" "5.13.fb" "${ext_backward_only_refs[@]}") +# To check for backup forward compatibility (old version reading data +# from 
# Populate the DB at $2 from the raw input file $1 using the saved
# generate_random_db.sh helper; abort the whole run on failure.
generate_db()
{
  set +e
  if ! bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"; then
    echo ==== Error loading data from $2 to $1 ====
    exit 1
  fi
  set -e
}
# Return 0 if the first argument equals any of the remaining arguments,
# 1 otherwise. Used to test membership of a ref in the branch lists.
member_of_array()
{
  local candidate="$1"
  shift
  local item
  for item in "$@"; do
    if [[ "$item" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
+ +# Decorate name +current_checkout_name="$current_checkout_name ($current_checkout_hash)" + +echo "== Building $current_checkout_name debug" +git checkout -B $tmp_branch $current_checkout_hash +force_no_fbcode +make clean +DISABLE_WARNING_AS_ERROR=1 make ldb -j32 + +echo "== Using $current_checkout_name, generate DB with extern SST and ingest" +current_ext_test_dir=$ext_test_dir"/current" +write_external_sst $input_data_path ${current_ext_test_dir}_pointless $current_ext_test_dir +ingest_external_sst ${current_ext_test_dir}_ingest $current_ext_test_dir + +echo "== Generating DB from $current_checkout_name ..." +current_db_test_dir=$db_test_dir"/current" +generate_db $input_data_path $current_db_test_dir + +echo "== Creating backup of DB from $current_checkout_name ..." +current_bak_test_dir=$bak_test_dir"/current" +backup_db $current_db_test_dir $current_bak_test_dir + +for checkout_ref in "${checkout_refs[@]}" +do + echo "== Building $checkout_ref debug" + git reset --hard $tmp_origin/$checkout_ref + force_no_fbcode + make clean + DISABLE_WARNING_AS_ERROR=1 make ldb -j32 + + # We currently assume DB backward compatibility for every branch listed + echo "== Use $checkout_ref to generate a DB ..." + generate_db $input_data_path $db_test_dir/$checkout_ref + + if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" || + member_of_array "$checkout_ref" "${ext_forward_refs[@]}" + then + echo "== Use $checkout_ref to generate DB with extern SST file" + write_external_sst $input_data_path $ext_test_dir/${checkout_ref}_pointless $ext_test_dir/$checkout_ref + fi + + if member_of_array "$checkout_ref" "${ext_forward_refs[@]}" + then + echo "== Use $checkout_ref to ingest extern SST file and compare vs. 
$current_checkout_name" + ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $ext_test_dir/$checkout_ref + compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1 + + rm -rf ${ext_test_dir:?}/${checkout_ref}_ingest + echo "== Use $checkout_ref to ingest extern SST file from $current_checkout_name" + ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $current_ext_test_dir + compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1 + fi + + if member_of_array "$checkout_ref" "${db_forward_no_options_refs[@]}" || + member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}" + then + echo "== Use $checkout_ref to open DB generated using $current_checkout_name..." + compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0 + fi + + if member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}" + then + echo "== Use $checkout_ref to open DB generated using $current_checkout_name with its options..." 
+ compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 1 1 + fi + + if member_of_array "$checkout_ref" "${bak_backward_only_refs[@]}" || + member_of_array "$checkout_ref" "${bak_forward_refs[@]}" + then + echo "== Use $checkout_ref to backup DB" + backup_db $db_test_dir/$checkout_ref $bak_test_dir/$checkout_ref + fi + + if member_of_array "$checkout_ref" "${bak_forward_refs[@]}" + then + echo "== Use $checkout_ref to restore DB from $current_checkout_name" + rm -rf ${db_test_dir:?}/$checkout_ref + restore_db $current_bak_test_dir $db_test_dir/$checkout_ref + compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0 + fi +done + +echo "== Building $current_checkout_name debug (again, final)" +git reset --hard $current_checkout_hash +force_no_fbcode +make clean +DISABLE_WARNING_AS_ERROR=1 make ldb -j32 + +for checkout_ref in "${checkout_refs[@]}" +do + # We currently assume DB backward compatibility for every branch listed + echo "== Use $current_checkout_name to open DB generated using $checkout_ref..." 
+ compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0 + + if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" || + member_of_array "$checkout_ref" "${ext_forward_refs[@]}" + then + rm -rf ${ext_test_dir:?}/${checkout_ref}_ingest + echo "== Use $current_checkout_name to ingest extern SST file from $checkout_ref" + ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $current_ext_test_dir + compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1 + fi + + if member_of_array "$checkout_ref" "${bak_backward_only_refs[@]}" || + member_of_array "$checkout_ref" "${bak_forward_refs[@]}" + then + echo "== Use $current_checkout_name to restore DB from $checkout_ref" + rm -rf ${db_test_dir:?}/$checkout_ref + restore_db $bak_test_dir/$checkout_ref $db_test_dir/$checkout_ref + compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0 + fi +done + +echo ==== Compatibility Test PASSED ==== diff --git a/src/rocksdb/tools/db_bench.cc b/src/rocksdb/tools/db_bench.cc new file mode 100644 index 000000000..f13de83fe --- /dev/null +++ b/src/rocksdb/tools/db_bench.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else +#include "rocksdb/db_bench_tool.h" +int main(int argc, char** argv) { + return ROCKSDB_NAMESPACE::db_bench_tool(argc, argv); +} +#endif // GFLAGS diff --git a/src/rocksdb/tools/db_bench_tool.cc b/src/rocksdb/tools/db_bench_tool.cc new file mode 100644 index 000000000..7182528b3 --- /dev/null +++ b/src/rocksdb/tools/db_bench_tool.cc @@ -0,0 +1,8707 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#ifdef NUMA +#include <numa.h> +#endif +#ifndef OS_WIN +#include <unistd.h> +#endif +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#ifdef __APPLE__ +#include <mach/host_info.h> +#include <mach/mach_host.h> +#include <sys/sysctl.h> +#endif +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#endif +#include <atomic> +#include <cinttypes> +#include <condition_variable> +#include <cstddef> +#include <iostream> +#include <memory> +#include <mutex> +#include <queue> +#include <thread> +#include <unordered_map> + +#include "db/db_impl/db_impl.h" +#include "db/malloc_stats.h" +#include "db/version_set.h" +#include "monitoring/histogram.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/memtablerep.h" +#include 
"rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/stats_history.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/backup_engine.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/options_type.h" +#include "rocksdb/utilities/options_util.h" +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/replayer.h" +#endif // ROCKSDB_LITE +#include "rocksdb/utilities/sim_cache.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/write_batch.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" +#include "tools/simulated_hybrid_file_system.h" +#include "util/cast_util.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/file_checksum_helper.h" +#include "util/gflags_compat.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stderr_logger.h" +#include "util/string_util.h" +#include "util/xxhash.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/counted_fs.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/persistent_cache/block_cache_tier.h" + +#ifdef MEMKIND +#include "memory/memkind_kmem_allocator.h" +#endif + +#ifdef OS_WIN +#include <io.h> // open/close +#endif + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::RegisterFlagValidator; +using GFLAGS_NAMESPACE::SetUsageMessage; +using GFLAGS_NAMESPACE::SetVersionString; + +#ifdef ROCKSDB_LITE +#define IF_ROCKSDB_LITE(Then, Else) Then +#else +#define IF_ROCKSDB_LITE(Then, Else) Else +#endif + +DEFINE_string( + benchmarks, + "fillseq," + "fillseqdeterministic," + 
"fillsync," + "fillrandom," + "filluniquerandomdeterministic," + "overwrite," + "readrandom," + "newiterator," + "newiteratorwhilewriting," + "seekrandom," + "seekrandomwhilewriting," + "seekrandomwhilemerging," + "readseq," + "readreverse," + "compact," + "compactall," + "flush," +IF_ROCKSDB_LITE("", + "compact0," + "compact1," + "waitforcompaction," +) + "multireadrandom," + "mixgraph," + "readseq," + "readtorowcache," + "readtocache," + "readreverse," + "readwhilewriting," + "readwhilemerging," + "readwhilescanning," + "readrandomwriterandom," + "updaterandom," + "xorupdaterandom," + "approximatesizerandom," + "randomwithverify," + "fill100K," + "crc32c," + "xxhash," + "xxhash64," + "xxh3," + "compress," + "uncompress," + "acquireload," + "fillseekseq," + "randomtransaction," + "randomreplacekeys," + "timeseries," + "getmergeoperands,", + "readrandomoperands," + "backup," + "restore" + + "Comma-separated list of operations to run in the specified" + " order. Available benchmarks:\n" + "\tfillseq -- write N values in sequential key" + " order in async mode\n" + "\tfillseqdeterministic -- write N values in the specified" + " key order and keep the shape of the LSM tree\n" + "\tfillrandom -- write N values in random key order in async" + " mode\n" + "\tfilluniquerandomdeterministic -- write N values in a random" + " key order and keep the shape of the LSM tree\n" + "\toverwrite -- overwrite N values in random key order in " + "async mode\n" + "\tfillsync -- write N/1000 values in random key order in " + "sync mode\n" + "\tfill100K -- write N/1000 100K values in random order in" + " async mode\n" + "\tdeleteseq -- delete N keys in sequential order\n" + "\tdeleterandom -- delete N keys in random order\n" + "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" + "\treadreverse -- read N times in reverse order\n" + "\treadrandom -- read N times in random order\n" + "\treadmissing -- read N missing keys in random 
order\n" + "\treadwhilewriting -- 1 writer, N threads doing random " + "reads\n" + "\treadwhilemerging -- 1 merger, N threads doing random " + "reads\n" + "\treadwhilescanning -- 1 thread doing full table scan, " + "N threads doing random reads\n" + "\treadrandomwriterandom -- N threads doing random-read, " + "random-write\n" + "\tupdaterandom -- N threads doing read-modify-write for random " + "keys\n" + "\txorupdaterandom -- N threads doing read-XOR-write for " + "random keys\n" + "\tappendrandom -- N threads doing read-modify-write with " + "growing values\n" + "\tmergerandom -- same as updaterandom/appendrandom using merge" + " operator. " + "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" + "\tnewiterator -- repeated iterator creation\n" + "\tseekrandom -- N random seeks, call Next seek_nexts times " + "per seek\n" + "\tseekrandomwhilewriting -- seekrandom and 1 thread doing " + "overwrite\n" + "\tseekrandomwhilemerging -- seekrandom and 1 thread doing " + "merge\n" + "\tcrc32c -- repeated crc32c of <block size> data\n" + "\txxhash -- repeated xxHash of <block size> data\n" + "\txxhash64 -- repeated xxHash64 of <block size> data\n" + "\txxh3 -- repeated XXH3 of <block size> data\n" + "\tacquireload -- load N*1000 times\n" + "\tfillseekseq -- write N values in sequential key, then read " + "them by seeking to each key\n" + "\trandomtransaction -- execute N random transactions and " + "verify correctness\n" + "\trandomreplacekeys -- randomly replaces N keys by deleting " + "the old version and putting the new version\n\n" + "\ttimeseries -- 1 writer generates time series data " + "and multiple readers doing random reads on id\n\n" + "Meta operations:\n" + "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n" + "\tcompactall -- Compact the entire DB\n" +IF_ROCKSDB_LITE("", + "\tcompact0 -- compact L0 into L1\n" + "\tcompact1 -- compact L1 into 
L2\n" + "\twaitforcompaction - pause until compaction is (probably) done\n" +) + "\tflush - flush the memtable\n" + "\tstats -- Print DB stats\n" + "\tresetstats -- Reset DB stats\n" + "\tlevelstats -- Print the number of files and bytes per level\n" + "\tmemstats -- Print memtable stats\n" + "\tsstables -- Print sstable info\n" + "\theapprofile -- Dump a heap profile (if supported by this port)\n" +IF_ROCKSDB_LITE("", + "\treplay -- replay the trace file specified with trace_file\n" +) + "\tgetmergeoperands -- Insert lots of merge records which are a list of " + "sorted ints for a key and then compare performance of lookup for another " + "key by doing a Get followed by binary searching in the large sorted list " + "vs doing a GetMergeOperands and binary searching in the operands which " + "are sorted sub-lists. The MergeOperator used is sortlist.h\n" + "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An " + "operation includes a rare but possible retry in case it got " + "`Status::Incomplete()`. This happens upon encountering more keys than " + "have ever been seen by the thread (or eight initially)\n" + "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. " + "Rate limit can be specified through --backup_rate_limit\n" + "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n"); + +DEFINE_int64(num, 1000000, "Number of key/values to place in database"); + +DEFINE_int64(numdistinct, 1000, + "Number of distinct keys to use. Used in RandomWithVerify to " + "read/write on fewer keys so that gets are more likely to find the" + " key and puts are more likely to update the same key"); + +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. 
" + "If negative, there will be FLAGS_num keys."); +DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); + +DEFINE_int32( + num_hot_column_families, 0, + "Number of Hot Column Families. If more than 0, only write to this " + "number of column families. After finishing all the writes to them, " + "create new set of column families and insert to them. Only used " + "when num_column_families > 1."); + +DEFINE_string(column_family_distribution, "", + "Comma-separated list of percentages, where the ith element " + "indicates the probability of an op using the ith column family. " + "The number of elements must be `num_hot_column_families` if " + "specified; otherwise, it must be `num_column_families`. The " + "sum of elements must be 100. E.g., if `num_column_families=4`, " + "and `num_hot_column_families=0`, a valid list could be " + "\"10,20,30,40\"."); + +DEFINE_int64(reads, -1, + "Number of read operations to do. " + "If negative, do FLAGS_num reads."); + +DEFINE_int64(deletes, -1, + "Number of delete operations to do. " + "If negative, do FLAGS_num deletions."); + +DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); + +DEFINE_int64(seed, 0, + "Seed base for random number generators. " + "When 0 it is derived from the current time."); +static int64_t seed_base; + +DEFINE_int32(threads, 1, "Number of concurrent threads to run."); + +DEFINE_int32(duration, 0, + "Time in seconds for the random-ops tests to run." 
+ " When 0 then num & reads determine the test duration"); + +DEFINE_string(value_size_distribution_type, "fixed", + "Value size distribution type: fixed, uniform, normal"); + +DEFINE_int32(value_size, 100, "Size of each value in fixed distribution"); +static unsigned int value_size = 100; + +DEFINE_int32(value_size_min, 100, "Min size of random value"); + +DEFINE_int32(value_size_max, 102400, "Max size of random value"); + +DEFINE_int32(seek_nexts, 0, + "How many times to call Next() after Seek() in " + "fillseekseq, seekrandom, seekrandomwhilewriting and " + "seekrandomwhilemerging"); + +DEFINE_bool(reverse_iterator, false, + "When true use Prev rather than Next for iterators that do " + "Seek and then Next"); + +DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark"); + +DEFINE_int64(max_scan_distance, 0, + "Used to define iterate_upper_bound (or iterate_lower_bound " + "if FLAGS_reverse_iterator is set to true) when value is nonzero"); + +DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); + +DEFINE_int64(batch_size, 1, "Batch size"); + +static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) { + return true; +} + +static bool ValidateUint32Range(const char* flagname, uint64_t value) { + if (value > std::numeric_limits<uint32_t>::max()) { + fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname, + (unsigned long)value); + return false; + } + return true; +} + +DEFINE_int32(key_size, 16, "size of each key"); + +DEFINE_int32(user_timestamp_size, 0, + "number of bytes in a user-defined timestamp"); + +DEFINE_int32(num_multi_db, 0, + "Number of DBs used in the benchmark. 
0 means single DB."); + +DEFINE_double(compression_ratio, 0.5, + "Arrange to generate values that shrink to this fraction of " + "their original size after compression"); + +DEFINE_double( + overwrite_probability, 0.0, + "Used in 'filluniquerandom' benchmark: for each write operation, " + "we give a probability to perform an overwrite instead. The key used for " + "the overwrite is randomly chosen from the last 'overwrite_window_size' " + "keys previously inserted into the DB. " + "Valid overwrite_probability values: [0.0, 1.0]."); + +DEFINE_uint32(overwrite_window_size, 1, + "Used in 'filluniquerandom' benchmark. For each write operation," + " when the overwrite_probability flag is set by the user, the " + "key used to perform an overwrite is randomly chosen from the " + "last 'overwrite_window_size' keys previously inserted into DB. " + "Warning: large values can affect throughput. " + "Valid overwrite_window_size values: [1, kMaxUint32]."); + +DEFINE_uint64( + disposable_entries_delete_delay, 0, + "Minimum delay in microseconds for the series of Deletes " + "to be issued. When 0 the insertion of the last disposable entry is " + "immediately followed by the issuance of the Deletes. " + "(only compatible with fillanddeleteuniquerandom benchmark)."); + +DEFINE_uint64(disposable_entries_batch_size, 0, + "Number of consecutively inserted disposable KV entries " + "that will be deleted after 'delete_delay' microseconds. " + "A series of Deletes is always issued once all the " + "disposable KV entries it targets have been inserted " + "into the DB. When 0 no deletes are issued and a " + "regular 'filluniquerandom' benchmark occurs. " + "(only compatible with fillanddeleteuniquerandom benchmark)"); + +DEFINE_int32(disposable_entries_value_size, 64, + "Size of the values (in bytes) of the entries targeted by " + "selective deletes. 
" + "(only compatible with fillanddeleteuniquerandom benchmark)"); + +DEFINE_uint64( + persistent_entries_batch_size, 0, + "Number of KV entries being inserted right before the deletes " + "targeting the disposable KV entries are issued. These " + "persistent keys are not targeted by the deletes, and will always " + "remain valid in the DB. (only compatible with " + "--benchmarks='fillanddeleteuniquerandom' " + "and used when--disposable_entries_batch_size is > 0)."); + +DEFINE_int32(persistent_entries_value_size, 64, + "Size of the values (in bytes) of the entries not targeted by " + "deletes. (only compatible with " + "--benchmarks='fillanddeleteuniquerandom' " + "and used when--disposable_entries_batch_size is > 0)."); + +DEFINE_double(read_random_exp_range, 0.0, + "Read random's key will be generated using distribution of " + "num * exp(-r) where r is uniform number from 0 to this value. " + "The larger the number is, the more skewed the reads are. " + "Only used in readrandom and multireadrandom benchmarks."); + +DEFINE_bool(histogram, false, "Print histogram of operation timings"); + +DEFINE_bool(confidence_interval_only, false, + "Print 95% confidence interval upper and lower bounds only for " + "aggregate stats."); + +DEFINE_bool(enable_numa, false, + "Make operations aware of NUMA architecture and bind memory " + "and cpus corresponding to nodes together. In NUMA, memory " + "in same node as CPUs are closer when compared to memory in " + "other nodes. Reads can be faster when the process is bound to " + "CPU and memory of same node. 
Use \"$numactl --hardware\" command " + "to see NUMA memory architecture."); + +DEFINE_int64(db_write_buffer_size, + ROCKSDB_NAMESPACE::Options().db_write_buffer_size, + "Number of bytes to buffer in all memtables before compacting"); + +DEFINE_bool(cost_write_buffer_to_cache, false, + "The usage of memtable is costed to the block cache"); + +DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size, + "The size, in bytes, of one block in arena memory allocation."); + +DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size, + "Number of bytes to buffer in memtable before compacting"); + +DEFINE_int32(max_write_buffer_number, + ROCKSDB_NAMESPACE::Options().max_write_buffer_number, + "The number of in-memory memtables. Each memtable is of size" + " write_buffer_size bytes."); + +DEFINE_int32(min_write_buffer_number_to_merge, + ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge, + "The minimum number of write buffers that will be merged together" + "before writing to storage. This is cheap because it is an" + "in-memory merge. If this feature is not enabled, then all these" + "write buffers are flushed to L0 as separate files and this " + "increases read amplification because a get request has to check" + " in all of these files. Also, an in-memory merge may result in" + " writing less data to storage if there are duplicate records " + " in each of these individual write buffers."); + +DEFINE_int32(max_write_buffer_number_to_maintain, + ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain, + "The total maximum number of write buffers to maintain in memory " + "including copies of buffers that have already been flushed. " + "Unlike max_write_buffer_number, this parameter does not affect " + "flushing. This controls the minimum amount of write history " + "that will be available in memory for conflict checking when " + "Transactions are used. 
If this value is too low, some " + "transactions may fail at commit time due to not being able to " + "determine whether there were any write conflicts. Setting this " + "value to 0 will cause write buffers to be freed immediately " + "after they are flushed. If this value is set to -1, " + "'max_write_buffer_number' will be used."); + +DEFINE_int64(max_write_buffer_size_to_maintain, + ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain, + "The total maximum size of write buffers to maintain in memory " + "including copies of buffers that have already been flushed. " + "Unlike max_write_buffer_number, this parameter does not affect " + "flushing. This controls the minimum amount of write history " + "that will be available in memory for conflict checking when " + "Transactions are used. If this value is too low, some " + "transactions may fail at commit time due to not being able to " + "determine whether there were any write conflicts. Setting this " + "value to 0 will cause write buffers to be freed immediately " + "after they are flushed. 
If this value is set to -1, " + "'max_write_buffer_number' will be used."); + +DEFINE_int32(max_background_jobs, + ROCKSDB_NAMESPACE::Options().max_background_jobs, + "The maximum number of concurrent background jobs that can occur " + "in parallel."); + +DEFINE_int32(num_bottom_pri_threads, 0, + "The number of threads in the bottom-priority thread pool (used " + "by universal compaction only)."); + +DEFINE_int32(num_high_pri_threads, 0, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +DEFINE_int32(num_low_pri_threads, 0, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +DEFINE_int32(max_background_compactions, + ROCKSDB_NAMESPACE::Options().max_background_compactions, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +DEFINE_uint64(subcompactions, 1, + "Maximum number of subcompactions to divide L0-L1 compactions " + "into."); +static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); + +DEFINE_int32(max_background_flushes, + ROCKSDB_NAMESPACE::Options().max_background_flushes, + "The maximum number of concurrent background flushes" + " that can occur in parallel."); + +static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e; +DEFINE_int32(compaction_style, + (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style, + "style of compaction: level-based, universal and fifo"); + +static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e; +DEFINE_int32(compaction_pri, + (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri, + "priority of files to compaction: by size or by data age"); + +DEFINE_int32(universal_size_ratio, 0, + "Percentage flexibility while comparing file size " + "(for universal compaction only)."); + +DEFINE_int32(universal_min_merge_width, 0, + "The minimum number of files in a single compaction run " + "(for 
universal compaction only)."); + +DEFINE_int32(universal_max_merge_width, 0, + "The max number of files to compact in universal style " + "compaction"); + +DEFINE_int32(universal_max_size_amplification_percent, 0, + "The max size amplification for universal style compaction"); + +DEFINE_int32(universal_compression_size_percent, -1, + "The percentage of the database to compress for universal " + "compaction. -1 means compress everything."); + +DEFINE_bool(universal_allow_trivial_move, false, + "Allow trivial move in universal compaction."); + +DEFINE_bool(universal_incremental, false, + "Enable incremental compactions in universal compaction."); + +DEFINE_int64(cache_size, 8 << 20, // 8MB + "Number of bytes to use as a cache of uncompressed data"); + +DEFINE_int32(cache_numshardbits, -1, + "Number of shards for the block cache" + " is 2 ** cache_numshardbits. Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + +DEFINE_double(cache_high_pri_pool_ratio, 0.0, + "Ratio of block cache reserve for high pri blocks. " + "If > 0.0, we also enable " + "cache_index_and_filter_blocks_with_high_priority."); + +DEFINE_double(cache_low_pri_pool_ratio, 0.0, + "Ratio of block cache reserve for low pri blocks."); + +DEFINE_string(cache_type, "lru_cache", "Type of block cache."); + +DEFINE_bool(use_compressed_secondary_cache, false, + "Use the CompressedSecondaryCache as the secondary cache."); + +DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB + "Number of bytes to use as a cache of data"); + +DEFINE_int32(compressed_secondary_cache_numshardbits, 6, + "Number of shards for the block cache" + " is 2 ** compressed_secondary_cache_numshardbits." + " Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + +DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0, + "Ratio of block cache reserve for high pri blocks. 
" + "If > 0.0, we also enable " + "cache_index_and_filter_blocks_with_high_priority."); + +DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0, + "Ratio of block cache reserve for low pri blocks."); + +DEFINE_string(compressed_secondary_cache_compression_type, "lz4", + "The compression algorithm to use for large " + "values stored in CompressedSecondaryCache."); +static enum ROCKSDB_NAMESPACE::CompressionType + FLAGS_compressed_secondary_cache_compression_type_e = + ROCKSDB_NAMESPACE::kLZ4Compression; + +DEFINE_uint32( + compressed_secondary_cache_compress_format_version, 2, + "compress_format_version can have two values: " + "compress_format_version == 1 -- decompressed size is not included" + " in the block header." + "compress_format_version == 2 -- decompressed size is included" + " in the block header in varint32 format."); + +DEFINE_int64(simcache_size, -1, + "Number of bytes to use as a simcache of " + "uncompressed data. Nagative value disables simcache."); + +DEFINE_bool(cache_index_and_filter_blocks, false, + "Cache index/filter blocks in block cache."); + +DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false, + "Use JemallocNodumpAllocator for block/blob cache."); + +DEFINE_bool(use_cache_memkind_kmem_allocator, false, + "Use memkind kmem allocator for block/blob cache."); + +DEFINE_bool(partition_index_and_filters, false, + "Partition index and filter blocks."); + +DEFINE_bool(partition_index, false, "Partition index blocks"); + +DEFINE_bool(index_with_first_key, false, "Include first key in the index"); + +DEFINE_bool( + optimize_filters_for_memory, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory, + "Minimize memory footprint of filters"); + +DEFINE_int64( + index_shortening_mode, 2, + "mode to shorten index: 0 for no shortening; 1 for only shortening " + "separaters; 2 for shortening shortening and successor"); + +DEFINE_int64(metadata_block_size, + 
ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size, + "Max partition size when partitioning index/filters"); + +// The default reduces the overhead of reading time with flash. With HDD, which +// offers much less throughput, however, this number better to be set to 1. +DEFINE_int32(ops_between_duration_checks, 1000, + "Check duration limit every x ops"); + +DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false, + "Pin index/filter blocks of L0 files in block cache."); + +DEFINE_bool( + pin_top_level_index_and_filter, false, + "Pin top-level index of partitioned index/filter blocks in block cache."); + +DEFINE_int32(block_size, + static_cast<int32_t>( + ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size), + "Number of bytes in a block."); + +DEFINE_int32(format_version, + static_cast<int32_t>( + ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version), + "Format version of SST files."); + +DEFINE_int32(block_restart_interval, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval, + "Number of keys between restart points " + "for delta encoding of keys in data block."); + +DEFINE_int32( + index_block_restart_interval, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval, + "Number of keys between restart points " + "for delta encoding of keys in index block."); + +DEFINE_int32(read_amp_bytes_per_bit, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit, + "Number of bytes per bit to be used in block read-amp bitmap"); + +DEFINE_bool( + enable_index_compression, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression, + "Compress the index block"); + +DEFINE_bool(block_align, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align, + "Align data blocks on page size"); + +DEFINE_int64(prepopulate_block_cache, 0, + "Pre-populate hot/warm blocks in block cache. 
0 to disable and 1 " + "to insert during flush"); + +DEFINE_bool(use_data_block_hash_index, false, + "if use kDataBlockBinaryAndHash " + "instead of kDataBlockBinarySearch. " + "This is valid if only we use BlockTable"); + +DEFINE_double(data_block_hash_table_util_ratio, 0.75, + "util ratio for data block hash index table. " + "This is only valid if use_data_block_hash_index is " + "set to true"); + +DEFINE_int64(compressed_cache_size, -1, + "Number of bytes to use as a cache of compressed data."); + +DEFINE_int64(row_cache_size, 0, + "Number of bytes to use as a cache of individual rows" + " (0 = disabled)."); + +DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files, + "Maximum number of files to keep open at the same time" + " (use default if == 0)"); + +DEFINE_int32(file_opening_threads, + ROCKSDB_NAMESPACE::Options().max_file_opening_threads, + "If open_files is set to -1, this option set the number of " + "threads that will be used to open files during DB::Open()"); + +DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size"); + +DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size"); + +DEFINE_int32(random_access_max_buffer_size, 1024 * 1024, + "Maximum windows randomaccess buffer size"); + +DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024, + "Maximum write buffer for Writable File"); + +DEFINE_int32(bloom_bits, -1, + "Bloom filter bits per key. Negative means use default." + "Zero disables."); + +DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter"); + +DEFINE_double(memtable_bloom_size_ratio, 0, + "Ratio of memtable size used for bloom filter. 
0 means no bloom " + "filter."); +DEFINE_bool(memtable_whole_key_filtering, false, + "Try to use whole key bloom filter in memtables."); +DEFINE_bool(memtable_use_huge_page, false, + "Try to use huge page in memtables."); + +DEFINE_bool(whole_key_filtering, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering, + "Use whole keys (in addition to prefixes) in SST bloom filter."); + +DEFINE_bool(use_existing_db, false, + "If true, do not destroy the existing database. If you set this " + "flag and also specify a benchmark that wants a fresh database, " + "that benchmark will fail."); + +DEFINE_bool(use_existing_keys, false, + "If true, uses existing keys in the DB, " + "rather than generating new ones. This involves some startup " + "latency to load all keys into memory. It is supported for the " + "same read/overwrite benchmarks as `-use_existing_db=true`, which " + "must also be set for this flag to be enabled. When this flag is " + "set, the value for `-num` will be ignored."); + +DEFINE_bool(show_table_properties, false, + "If true, then per-level table" + " properties will be printed on every stats-interval when" + " stats_interval is set and stats_per_interval is on."); + +DEFINE_string(db, "", "Use the db with the following name."); + +DEFINE_bool(progress_reports, true, + "If true, db_bench will report number of finished operations."); + +// Read cache flags + +DEFINE_string(read_cache_path, "", + "If not empty string, a read cache will be used in this path"); + +DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024, + "Maximum size of the read cache"); + +DEFINE_bool(read_cache_direct_write, true, + "Whether to use Direct IO for writing to the read cache"); + +DEFINE_bool(read_cache_direct_read, true, + "Whether to use Direct IO for reading from read cache"); + +DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter"); + +static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) { + if (value >= 20) { + 
fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname, + value); + return false; + } + return true; +} + +DEFINE_bool(verify_checksum, true, + "Verify checksum for every block read from storage"); + +DEFINE_int32(checksum_type, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum, + "ChecksumType as an int"); + +DEFINE_bool(statistics, false, "Database statistics"); +DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers, + "stats level for statistics"); +DEFINE_string(statistics_string, "", "Serialized statistics string"); +static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats; + +DEFINE_int64(writes, -1, + "Number of write operations to do. If negative, do --num reads."); + +DEFINE_bool(finish_after_writes, false, + "Write thread terminates after all writes are finished"); + +DEFINE_bool(sync, false, "Sync all writes to disk"); + +DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); + +DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); + +DEFINE_bool(manual_wal_flush, false, + "If true, buffer WAL until buffer is full or a manual FlushWAL()."); + +DEFINE_string(wal_compression, "none", + "Algorithm to use for WAL compression. 
none to disable."); +static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e = + ROCKSDB_NAMESPACE::kNoCompression; + +DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); + +DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench", + "Truth key/values used when using verify"); + +DEFINE_int32(num_levels, 7, "The total number of levels"); + +DEFINE_int64(target_file_size_base, + ROCKSDB_NAMESPACE::Options().target_file_size_base, + "Target file size at level-1"); + +DEFINE_int32(target_file_size_multiplier, + ROCKSDB_NAMESPACE::Options().target_file_size_multiplier, + "A multiplier to compute target level-N file size (N >= 2)"); + +DEFINE_uint64(max_bytes_for_level_base, + ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base, + "Max bytes for level-1"); + +DEFINE_bool(level_compaction_dynamic_level_bytes, false, + "Whether level size base is dynamic"); + +DEFINE_double(max_bytes_for_level_multiplier, 10, + "A multiplier to compute max bytes for level-N (N >= 2)"); + +static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v; +DEFINE_string(max_bytes_for_level_multiplier_additional, "", + "A vector that specifies additional fanout per level"); + +DEFINE_int32(level0_stop_writes_trigger, + ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger, + "Number of files in level-0 that will trigger put stop."); + +DEFINE_int32(level0_slowdown_writes_trigger, + ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger, + "Number of files in level-0 that will slow down writes."); + +DEFINE_int32(level0_file_num_compaction_trigger, + ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger, + "Number of files in level-0 when compactions start."); + +DEFINE_uint64(periodic_compaction_seconds, + ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds, + "Files older than this will be picked up for compaction and" + " rewritten to the same level"); + +DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set 
options.ttl"); + +static bool ValidateInt32Percent(const char* flagname, int32_t value) { + if (value <= 0 || value >= 100) { + fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname, + value); + return false; + } + return true; +} +DEFINE_int32(readwritepercent, 90, + "Ratio of reads to reads/writes (expressed as percentage) for " + "the ReadRandomWriteRandom workload. The default value 90 means " + "90% operations out of all reads and writes operations are " + "reads. In other words, 9 gets for every 1 put."); + +DEFINE_int32(mergereadpercent, 70, + "Ratio of merges to merges&reads (expressed as percentage) for " + "the ReadRandomMergeRandom workload. The default value 70 means " + "70% out of all read and merge operations are merges. In other " + "words, 7 merges for every 3 gets."); + +DEFINE_int32(deletepercent, 2, + "Percentage of deletes out of reads/writes/deletes (used in " + "RandomWithVerify only). RandomWithVerify " + "calculates writepercent as (100 - FLAGS_readwritepercent - " + "deletepercent), so deletepercent must be smaller than (100 - " + "FLAGS_readwritepercent)"); + +DEFINE_bool(optimize_filters_for_hits, + ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits, + "Optimizes bloom filters for workloads for most lookups return " + "a value. For now this doesn't create bloom filters for the max " + "level of the LSM to reduce metadata that should fit in RAM. 
"); + +DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks, + "RocksDB will aggressively check consistency of the data."); + +DEFINE_bool(force_consistency_checks, + ROCKSDB_NAMESPACE::Options().force_consistency_checks, + "Runs consistency checks on the LSM every time a change is " + "applied."); + +DEFINE_bool(check_flush_compaction_key_order, + ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order, + "During flush or compaction, check whether keys inserted to " + "output files are in order."); + +DEFINE_uint64(delete_obsolete_files_period_micros, 0, + "Ignored. Left here for backward compatibility"); + +DEFINE_int64(writes_before_delete_range, 0, + "Number of writes before DeleteRange is called regularly."); + +DEFINE_int64(writes_per_range_tombstone, 0, + "Number of writes between range tombstones"); + +DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range"); + +DEFINE_int64(max_num_range_tombstones, 0, + "Maximum number of range tombstones to insert."); + +DEFINE_bool(expand_range_tombstones, false, + "Expand range tombstone into sequential regular tombstones."); + +#ifndef ROCKSDB_LITE +// Transactions Options +DEFINE_bool(optimistic_transaction_db, false, + "Open a OptimisticTransactionDB instance. " + "Required for randomtransaction benchmark."); + +DEFINE_bool(transaction_db, false, + "Open a TransactionDB instance. " + "Required for randomtransaction benchmark."); + +DEFINE_uint64(transaction_sets, 2, + "Number of keys each transaction will " + "modify (use in RandomTransaction only). Max: 9999"); + +DEFINE_bool(transaction_set_snapshot, false, + "Setting to true will have each transaction call SetSnapshot()" + " upon creation."); + +DEFINE_int32(transaction_sleep, 0, + "Max microseconds to sleep in between " + "reading and writing a value (used in RandomTransaction only). 
"); + +DEFINE_uint64(transaction_lock_timeout, 100, + "If using a transaction_db, specifies the lock wait timeout in" + " milliseconds before failing a transaction waiting on a lock"); +DEFINE_string( + options_file, "", + "The path to a RocksDB options file. If specified, then db_bench will " + "run with the RocksDB options in the default column family of the " + "specified options file. " + "Note that with this setting, db_bench will ONLY accept the following " + "RocksDB options related command-line arguments, all other arguments " + "that are related to RocksDB options will be ignored:\n" + "\t--use_existing_db\n" + "\t--use_existing_keys\n" + "\t--statistics\n" + "\t--row_cache_size\n" + "\t--row_cache_numshardbits\n" + "\t--enable_io_prio\n" + "\t--dump_malloc_stats\n" + "\t--num_multi_db\n"); + +// FIFO Compaction Options +DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0, + "The limit of total table file sizes to trigger FIFO compaction"); + +DEFINE_bool(fifo_compaction_allow_compaction, true, + "Allow compaction in FIFO compaction."); + +DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds."); + +DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction."); + +// Stacked BlobDB Options +DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance."); + +DEFINE_bool( + blob_db_enable_gc, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, + "[Stacked BlobDB] Enable BlobDB garbage collection."); + +DEFINE_double( + blob_db_gc_cutoff, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, + "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); + +DEFINE_bool(blob_db_is_fifo, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo, + "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB."); + +DEFINE_uint64(blob_db_max_db_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size, + "[Stacked BlobDB] Max size limit of the directory where blob " + "files 
are stored."); + +DEFINE_uint64(blob_db_max_ttl_range, 0, + "[Stacked BlobDB] TTL range to generate BlobDB data (in " + "seconds). 0 means no TTL."); + +DEFINE_uint64( + blob_db_ttl_range_secs, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs, + "[Stacked BlobDB] TTL bucket size to use when creating blob files."); + +DEFINE_uint64( + blob_db_min_blob_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, + "[Stacked BlobDB] Smallest blob to store in a file. Blobs " + "smaller than this will be inlined with the key in the LSM tree."); + +DEFINE_uint64(blob_db_bytes_per_sync, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, + "[Stacked BlobDB] Bytes to sync blob file at."); + +DEFINE_uint64(blob_db_file_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, + "[Stacked BlobDB] Target size of each blob file."); + +DEFINE_string( + blob_db_compression_type, "snappy", + "[Stacked BlobDB] Algorithm to use to compress blobs in blob files."); +static enum ROCKSDB_NAMESPACE::CompressionType + FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; + +#endif // ROCKSDB_LITE + +// Integrated BlobDB options +DEFINE_bool( + enable_blob_files, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, + "[Integrated BlobDB] Enable writing large values to separate blob files."); + +DEFINE_uint64(min_blob_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, + "[Integrated BlobDB] The size of the smallest value to be stored " + "separately in a blob file."); + +DEFINE_uint64(blob_file_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, + "[Integrated BlobDB] The size limit for blob files."); + +DEFINE_string(blob_compression_type, "none", + "[Integrated BlobDB] The compression algorithm to use for large " + "values stored in blob files."); + +DEFINE_bool(enable_blob_garbage_collection, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + 
.enable_blob_garbage_collection, + "[Integrated BlobDB] Enable blob garbage collection."); + +DEFINE_double(blob_garbage_collection_age_cutoff, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_age_cutoff, + "[Integrated BlobDB] The cutoff in terms of blob file age for " + "garbage collection."); + +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + +DEFINE_uint64(blob_compaction_readahead_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_compaction_readahead_size, + "[Integrated BlobDB] Compaction readahead for blob files."); + +DEFINE_int32( + blob_file_starting_level, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level, + "[Integrated BlobDB] The starting level for blob files."); + +DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache."); + +DEFINE_bool( + use_shared_block_and_blob_cache, true, + "[Integrated BlobDB] Use a shared backing cache for both block " + "cache and blob cache. It only takes effect if use_blob_cache is enabled."); + +DEFINE_uint64( + blob_cache_size, 8 << 20, + "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only " + "takes effect if the block and blob caches are different " + "(use_shared_block_and_blob_cache = false)."); + +DEFINE_int32(blob_cache_numshardbits, 6, + "[Integrated BlobDB] Number of shards for the blob cache is 2 ** " + "blob_cache_numshardbits. Negative means use default settings. " + "It only takes effect if blob_cache_size is greater than 0, and " + "the block and blob caches are different " + "(use_shared_block_and_blob_cache = false)."); + +DEFINE_int32(prepopulate_blob_cache, 0, + "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 
0 " + "to disable and 1 to insert during flush."); + +#ifndef ROCKSDB_LITE + +// Secondary DB instance Options +DEFINE_bool(use_secondary_db, false, + "Open a RocksDB secondary instance. A primary instance can be " + "running in another db_bench process."); + +DEFINE_string(secondary_path, "", + "Path to a directory used by the secondary instance to store " + "private files, e.g. info log."); + +DEFINE_int32(secondary_update_interval, 5, + "Secondary instance attempts to catch up with the primary every " + "secondary_update_interval seconds."); + +#endif // ROCKSDB_LITE + +DEFINE_bool(report_bg_io_stats, false, + "Measure times spents on I/Os while in compactions. "); + +DEFINE_bool(use_stderr_info_logger, false, + "Write info logs to stderr instead of to LOG file. "); + +#ifndef ROCKSDB_LITE + +DEFINE_string(trace_file, "", "Trace workload to a file. "); + +DEFINE_double(trace_replay_fast_forward, 1.0, + "Fast forward trace replay, must > 0.0."); +DEFINE_int32(block_cache_trace_sampling_frequency, 1, + "Block cache trace sampling frequency, termed s. It uses spatial " + "downsampling and samples accesses to one out of s blocks."); +DEFINE_int64( + block_cache_trace_max_trace_file_size_in_bytes, + uint64_t{64} * 1024 * 1024 * 1024, + "The maximum block cache trace file size in bytes. Block cache accesses " + "will not be logged if the trace file size exceeds this threshold. 
Default " + "is 64 GB."); +DEFINE_string(block_cache_trace_file, "", "Block cache trace file path."); +DEFINE_int32(trace_replay_threads, 1, + "The number of threads to replay, must >=1."); + +DEFINE_bool(io_uring_enabled, true, + "If true, enable the use of IO uring if the platform supports it"); +extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; } +#endif // ROCKSDB_LITE + +DEFINE_bool(adaptive_readahead, false, + "carry forward internal auto readahead size from one file to next " + "file at each level during iteration"); + +DEFINE_bool(rate_limit_user_ops, false, + "When true use Env::IO_USER priority level to charge internal rate " + "limiter for reads associated with user operations."); + +DEFINE_bool(file_checksum, false, + "When true use FileChecksumGenCrc32cFactory for " + "file_checksum_gen_factory."); + +DEFINE_bool(rate_limit_auto_wal_flush, false, + "When true use Env::IO_USER priority level to charge internal rate " + "limiter for automatic WAL flush (`Options::manual_wal_flush` == " + "false) after the user write operation."); + +DEFINE_bool(async_io, false, + "When set true, RocksDB does asynchronous reads for internal auto " + "readahead prefetching."); + +DEFINE_bool(optimize_multiget_for_io, true, + "When set true, RocksDB does asynchronous reads for SST files in " + "multiple levels for MultiGet."); + +DEFINE_bool(charge_compression_dictionary_building_buffer, false, + "Setting for " + "CacheEntryRoleOptions::charged of " + "CacheEntryRole::kCompressionDictionaryBuildingBuffer"); + +DEFINE_bool(charge_filter_construction, false, + "Setting for " + "CacheEntryRoleOptions::charged of " + "CacheEntryRole::kFilterConstruction"); + +DEFINE_bool(charge_table_reader, false, + "Setting for " + "CacheEntryRoleOptions::charged of " + "CacheEntryRole::kBlockBasedTableReader"); + +DEFINE_bool(charge_file_metadata, false, + "Setting for " + "CacheEntryRoleOptions::charged of " + "CacheEntryRole::kFileMetadata"); + 
+DEFINE_bool(charge_blob_cache, false, + "Setting for " + "CacheEntryRoleOptions::charged of " + "CacheEntryRole::kBlobCache"); + +DEFINE_uint64(backup_rate_limit, 0ull, + "If non-zero, db_bench will rate limit reads and writes for DB " + "backup. This " + "is the global rate in ops/second."); + +DEFINE_uint64(restore_rate_limit, 0ull, + "If non-zero, db_bench will rate limit reads and writes for DB " + "restore. This " + "is the global rate in ops/second."); + +DEFINE_string(backup_dir, "", + "If not empty string, use the given dir for backup."); + +DEFINE_string(restore_dir, "", + "If not empty string, use the given dir for restore."); + +DEFINE_uint64( + initial_auto_readahead_size, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size, + "RocksDB does auto-readahead for iterators on noticing more than two reads " + "for a table file if user doesn't provide readahead_size. The readahead " + "size starts at initial_auto_readahead_size"); + +DEFINE_uint64( + max_auto_readahead_size, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size, + "Rocksdb implicit readahead starts at " + "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every " + "additional read upto max_auto_readahead_size"); + +DEFINE_uint64( + num_file_reads_for_auto_readahead, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .num_file_reads_for_auto_readahead, + "Rocksdb implicit readahead is enabled if reads are sequential and " + "num_file_reads_for_auto_readahead indicates after how many sequential " + "reads into that file internal auto prefetching should be start."); + +static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( + const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return ROCKSDB_NAMESPACE::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return ROCKSDB_NAMESPACE::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return ROCKSDB_NAMESPACE::kZlibCompression; + else if 
(!strcasecmp(ctype, "bzip2")) + return ROCKSDB_NAMESPACE::kBZip2Compression; + else if (!strcasecmp(ctype, "lz4")) + return ROCKSDB_NAMESPACE::kLZ4Compression; + else if (!strcasecmp(ctype, "lz4hc")) + return ROCKSDB_NAMESPACE::kLZ4HCCompression; + else if (!strcasecmp(ctype, "xpress")) + return ROCKSDB_NAMESPACE::kXpressCompression; + else if (!strcasecmp(ctype, "zstd")) + return ROCKSDB_NAMESPACE::kZSTD; + else { + fprintf(stderr, "Cannot parse compression type '%s'\n", ctype); + exit(1); + } +} + +static std::string ColumnFamilyName(size_t i) { + if (i == 0) { + return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName; + } else { + char name[100]; + snprintf(name, sizeof(name), "column_family_name_%06zu", i); + return std::string(name); + } +} + +DEFINE_string(compression_type, "snappy", + "Algorithm to use to compress the database"); +static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e = + ROCKSDB_NAMESPACE::kSnappyCompression; + +DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression"); + +DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level, + "Compression level. The meaning of this value is library-" + "dependent. If unset, we try to use the default for the library " + "specified in `--compression_type`"); + +DEFINE_int32(compression_max_dict_bytes, + ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes, + "Maximum size of dictionary used to prime the compression " + "library."); + +DEFINE_int32(compression_zstd_max_train_bytes, + ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes, + "Maximum size of training data passed to zstd's dictionary " + "trainer."); + +DEFINE_int32(min_level_to_compress, -1, + "If non-negative, compression starts" + " from this level. Levels with number < min_level_to_compress are" + " not compressed. 
Otherwise, apply compression_type to " + "all levels."); + +DEFINE_int32(compression_parallel_threads, 1, + "Number of threads for parallel compression."); + +DEFINE_uint64(compression_max_dict_buffer_bytes, + ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes, + "Maximum bytes to buffer to collect samples for dictionary."); + +DEFINE_bool(compression_use_zstd_dict_trainer, + ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer, + "If true, use ZSTD_TrainDictionary() to create dictionary, else" + "use ZSTD_FinalizeDictionary() to create dictionary"); + +static bool ValidateTableCacheNumshardbits(const char* flagname, + int32_t value) { + if (0 >= value || value >= 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(table_cache_numshardbits, 4, ""); + +#ifndef ROCKSDB_LITE +DEFINE_string(env_uri, "", + "URI for registry Env lookup. Mutually exclusive with --fs_uri"); +DEFINE_string(fs_uri, "", + "URI for registry Filesystem lookup. Mutually exclusive" + " with --env_uri." + " Creates a default environment with the specified filesystem."); +#endif // ROCKSDB_LITE +DEFINE_string(simulate_hybrid_fs_file, "", + "File for Store Metadata for Simulate hybrid FS. Empty means " + "disable the feature. Now, if it is set, last_level_temperature " + "is set to kWarm."); +DEFINE_int32(simulate_hybrid_hdd_multipliers, 1, + "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs " + "are simulated."); +DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD."); + +DEFINE_int64( + preclude_last_level_data_seconds, 0, + "Preclude the latest data from the last level. 
(Used for tiered storage)"); + +DEFINE_int64(preserve_internal_time_seconds, 0, + "Preserve the internal time information which stores with SST."); + +static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard; + +static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default(); + +DEFINE_int64(stats_interval, 0, + "Stats are reported every N operations when this is greater than " + "zero. When 0 the interval grows over time."); + +DEFINE_int64(stats_interval_seconds, 0, + "Report stats every N seconds. This overrides stats_interval when" + " both are > 0."); + +DEFINE_int32(stats_per_interval, 0, + "Reports additional stats per interval when this is greater than " + "0."); + +DEFINE_uint64(slow_usecs, 1000000, + "A message is printed for operations that take at least this " + "many microseconds."); + +DEFINE_int64(report_interval_seconds, 0, + "If greater than zero, it will write simple stats in CSV format " + "to --report_file every N seconds"); + +DEFINE_string(report_file, "report.csv", + "Filename where some simple stats are reported to (if " + "--report_interval_seconds is bigger than 0)"); + +DEFINE_int32(thread_status_per_interval, 0, + "Takes and report a snapshot of the current status of each thread" + " when this is greater than 0."); + +DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable, + "Level of perf collection"); + +DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024, + "Slowdown writes if pending compaction bytes exceed this number"); + +DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024, + "Stop writes if pending compaction bytes exceed this number"); + +DEFINE_uint64(delayed_write_rate, 8388608u, + "Limited bytes allowed to DB when soft_rate_limit or " + "level0_slowdown_writes_trigger triggers"); + +DEFINE_bool(enable_pipelined_write, true, + "Allow WAL and memtable writes to be pipelined"); + +DEFINE_bool( + unordered_write, false, + "Enable the unordered write 
feature, which provides higher throughput but " + "relaxes the guarantees around atomic reads and immutable snapshots"); + +DEFINE_bool(allow_concurrent_memtable_write, true, + "Allow multi-writers to update mem tables in parallel."); + +DEFINE_double(experimental_mempurge_threshold, 0.0, + "Maximum useful payload ratio estimate that triggers a mempurge " + "(memtable garbage collection)."); + +DEFINE_bool(inplace_update_support, + ROCKSDB_NAMESPACE::Options().inplace_update_support, + "Support in-place memtable update for smaller or same-size values"); + +DEFINE_uint64(inplace_update_num_locks, + ROCKSDB_NAMESPACE::Options().inplace_update_num_locks, + "Number of RW locks to protect in-place memtable updates"); + +DEFINE_bool(enable_write_thread_adaptive_yield, true, + "Use a yielding spin loop for brief writer thread waits."); + +DEFINE_uint64( + write_thread_max_yield_usec, 100, + "Maximum microseconds for enable_write_thread_adaptive_yield operation."); + +DEFINE_uint64(write_thread_slow_yield_usec, 3, + "The threshold at which a slow yield is considered a signal that " + "other processes or threads want the core."); + +DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value."); + +DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000, + "Set refill period on rate limiter."); + +DEFINE_bool(rate_limiter_auto_tuned, false, + "Enable dynamic adjustment of rate limit according to demand for " + "background I/O"); + +DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit"); + +DEFINE_uint64( + sine_write_rate_interval_milliseconds, 10000, + "Interval of which the sine wave write_rate_limit is recalculated"); + +DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d"); + +DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d"); + +DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d"); + +DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d"); + +DEFINE_bool(rate_limit_bg_reads, false, + "Use options.rate_limiter on 
compaction reads"); + +DEFINE_uint64( + benchmark_write_rate_limit, 0, + "If non-zero, db_bench will rate-limit the writes going into RocksDB. This " + "is the global rate in bytes/second."); + +// the parameters of mix_graph +DEFINE_double(keyrange_dist_a, 0.0, + "The parameter 'a' of prefix average access distribution " + "f(x)=a*exp(b*x)+c*exp(d*x)"); +DEFINE_double(keyrange_dist_b, 0.0, + "The parameter 'b' of prefix average access distribution " + "f(x)=a*exp(b*x)+c*exp(d*x)"); +DEFINE_double(keyrange_dist_c, 0.0, + "The parameter 'c' of prefix average access distribution" + "f(x)=a*exp(b*x)+c*exp(d*x)"); +DEFINE_double(keyrange_dist_d, 0.0, + "The parameter 'd' of prefix average access distribution" + "f(x)=a*exp(b*x)+c*exp(d*x)"); +DEFINE_int64(keyrange_num, 1, + "The number of key ranges that are in the same prefix " + "group, each prefix range will have its key access distribution"); +DEFINE_double(key_dist_a, 0.0, + "The parameter 'a' of key access distribution model f(x)=a*x^b"); +DEFINE_double(key_dist_b, 0.0, + "The parameter 'b' of key access distribution model f(x)=a*x^b"); +DEFINE_double(value_theta, 0.0, + "The parameter 'theta' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +// Use reasonable defaults based on the mixgraph paper +DEFINE_double(value_k, 0.2615, + "The parameter 'k' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +// Use reasonable defaults based on the mixgraph paper +DEFINE_double(value_sigma, 25.45, + "The parameter 'theta' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +DEFINE_double(iter_theta, 0.0, + "The parameter 'theta' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +// Use reasonable defaults based on the mixgraph paper +DEFINE_double(iter_k, 2.517, + "The parameter 'k' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +// Use 
reasonable defaults based on the mixgraph paper +DEFINE_double(iter_sigma, 14.236, + "The parameter 'sigma' of Generized Pareto Distribution " + "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)"); +DEFINE_double(mix_get_ratio, 1.0, + "The ratio of Get queries of mix_graph workload"); +DEFINE_double(mix_put_ratio, 0.0, + "The ratio of Put queries of mix_graph workload"); +DEFINE_double(mix_seek_ratio, 0.0, + "The ratio of Seek queries of mix_graph workload"); +DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator"); +DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload"); +DEFINE_double( + sine_mix_rate_noise, 0.0, + "Add the noise ratio to the sine rate, it is between 0.0 and 1.0"); +DEFINE_bool(sine_mix_rate, false, + "Enable the sine QPS control on the mix workload"); +DEFINE_uint64( + sine_mix_rate_interval_milliseconds, 10000, + "Interval of which the sine wave read_rate_limit is recalculated"); +DEFINE_int64(mix_accesses, -1, + "The total query accesses of mix_graph workload"); + +DEFINE_uint64( + benchmark_read_rate_limit, 0, + "If non-zero, db_bench will rate-limit the reads from RocksDB. 
This " + "is the global rate in ops/second."); + +DEFINE_uint64(max_compaction_bytes, + ROCKSDB_NAMESPACE::Options().max_compaction_bytes, + "Max bytes allowed in one compaction"); + +#ifndef ROCKSDB_LITE +DEFINE_bool(readonly, false, "Run read only benchmarks."); + +DEFINE_bool(print_malloc_stats, false, + "Print malloc stats to stdout after benchmarks finish."); +#endif // ROCKSDB_LITE + +DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); + +DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); +DEFINE_uint64(wal_size_limit_MB, 0, + "Set the size limit for the WAL Files in MB."); +DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); + +DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads, + "Allow reads to occur via mmap-ing files"); + +DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes, + "Allow writes to occur via mmap-ing files"); + +DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads, + "Use O_DIRECT for reading data"); + +DEFINE_bool(use_direct_io_for_flush_and_compaction, + ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction, + "Use O_DIRECT for background flush and compaction writes"); + +DEFINE_bool(advise_random_on_open, + ROCKSDB_NAMESPACE::Options().advise_random_on_open, + "Advise random access on table file open"); + +DEFINE_string(compaction_fadvice, "NORMAL", + "Access pattern advice when a file is compacted"); +static auto FLAGS_compaction_fadvice_e = + ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start; + +DEFINE_bool(use_tailing_iterator, false, + "Use tailing iterator to access a series of keys instead of get"); + +DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex, + "Use adaptive mutex"); + +DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync, + "Allows OS to incrementally sync SST files to disk while they are" + " being written, in the 
background. Issue one request for every" + " bytes_per_sync written. 0 turns it off."); + +DEFINE_uint64(wal_bytes_per_sync, + ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync, + "Allows OS to incrementally sync WAL files to disk while they are" + " being written, in the background. Issue one request for every" + " wal_bytes_per_sync written. 0 turns it off."); + +DEFINE_bool(use_single_deletes, true, + "Use single deletes (used in RandomReplaceKeys only)."); + +DEFINE_double(stddev, 2000.0, + "Standard deviation of normal distribution used for picking keys" + " (used in RandomReplaceKeys only)."); + +DEFINE_int32(key_id_range, 100000, + "Range of possible value of key id (used in TimeSeries only)."); + +DEFINE_string(expire_style, "none", + "Style to remove expired time entries. Can be one of the options " + "below: none (do not expired data), compaction_filter (use a " + "compaction filter to remove expired data), delete (seek IDs and " + "remove expired data) (used in TimeSeries only)."); + +DEFINE_uint64( + time_range, 100000, + "Range of timestamp that store in the database (used in TimeSeries" + " only)."); + +DEFINE_int32(num_deletion_threads, 1, + "Number of threads to do deletion (used in TimeSeries and delete " + "expire_style only)."); + +DEFINE_int32(max_successive_merges, 0, + "Maximum number of successive merge operations on a key in the " + "memtable"); + +static bool ValidatePrefixSize(const char* flagname, int32_t value) { + if (value < 0 || value >= 2000000000) { + fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", + flagname, value); + return false; + } + return true; +} + +DEFINE_int32(prefix_size, 0, + "control the prefix size for HashSkipList and plain table"); +DEFINE_int64(keys_per_prefix, 0, + "control average number of keys generated per prefix, 0 means no " + "special handling of the prefix, i.e. 
use the prefix comes with " + "the generated random number."); +DEFINE_bool(total_order_seek, false, + "Enable total order seek regardless of index format."); +DEFINE_bool(prefix_same_as_start, false, + "Enforce iterator to return keys with prefix same as seek key."); +DEFINE_bool( + seek_missing_prefix, false, + "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8"); + +DEFINE_int32(memtable_insert_with_hint_prefix_size, 0, + "If non-zero, enable " + "memtable insert with hint with the given prefix size."); +DEFINE_bool(enable_io_prio, false, + "Lower the background flush/compaction threads' IO priority"); +DEFINE_bool(enable_cpu_prio, false, + "Lower the background flush/compaction threads' CPU priority"); +DEFINE_bool(identity_as_first_hash, false, + "the first hash function of cuckoo table becomes an identity " + "function. This is only valid when key is 8 bytes"); +DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG "); +DEFINE_uint64(stats_dump_period_sec, + ROCKSDB_NAMESPACE::Options().stats_dump_period_sec, + "Gap between printing stats to log in seconds"); +DEFINE_uint64(stats_persist_period_sec, + ROCKSDB_NAMESPACE::Options().stats_persist_period_sec, + "Gap between persisting stats in seconds"); +DEFINE_bool(persist_stats_to_disk, + ROCKSDB_NAMESPACE::Options().persist_stats_to_disk, + "whether to persist stats to disk"); +DEFINE_uint64(stats_history_buffer_size, + ROCKSDB_NAMESPACE::Options().stats_history_buffer_size, + "Max number of stats snapshots to keep in memory"); +DEFINE_bool(avoid_flush_during_recovery, + ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, + "If true, avoids flushing the recovered WAL data where possible."); +DEFINE_int64(multiread_stride, 0, + "Stride length for the keys in a MultiGet batch"); +DEFINE_bool(multiread_batched, false, "Use the new MultiGet API"); + +DEFINE_string(memtablerep, "skip_list", ""); +DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count"); 
+DEFINE_bool(use_plain_table, false, + "if use plain table instead of block-based table format"); +DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format"); +DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); +DEFINE_bool(use_hash_search, false, + "if use kHashSearch instead of kBinarySearch. " + "This is valid if only we use BlockTable"); +DEFINE_string(merge_operator, "", + "The merge operator to use with the database." + "If a new merge operator is specified, be sure to use fresh" + " database The possible merge operators are defined in" + " utilities/merge_operators.h"); +DEFINE_int32(skip_list_lookahead, 0, + "Used with skip_list memtablerep; try linear search first for " + "this many steps from the previous position"); +DEFINE_bool(report_file_operations, false, + "if report number of file operations"); +DEFINE_bool(report_open_timing, false, "if report open timing"); +DEFINE_int32(readahead_size, 0, "Iterator readahead size"); + +DEFINE_bool(read_with_latest_user_timestamp, true, + "If true, always use the current latest timestamp for read. 
If " + "false, choose a random timestamp from the past."); + +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache; +#endif // ROCKSDB_LITE + +static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); + +static const bool FLAGS_key_size_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize); + +static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_cache_numshardbits, + &ValidateCacheNumshardbits); + +static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent); + +DEFINE_int32(disable_seek_compaction, false, + "Not used, left here for backwards compatibility"); + +DEFINE_bool(allow_data_in_errors, + ROCKSDB_NAMESPACE::Options().allow_data_in_errors, + "If true, allow logging data, e.g. key, value in LOG files."); + +static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent); +static const bool FLAGS_table_cache_numshardbits_dummy + __attribute__((__unused__)) = RegisterFlagValidator( + &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits); + +DEFINE_uint32(write_batch_protection_bytes_per_key, 0, + "Size of per-key-value checksum in each write batch. Currently " + "only value 0 and 8 are supported."); + +DEFINE_uint32( + memtable_protection_bytes_per_key, 0, + "Enable memtable per key-value checksum protection. " + "Each entry in memtable will be suffixed by a per key-value checksum. " + "This options determines the size of such checksums. 
" + "Supported values: 0, 1, 2, 4, 8."); + +DEFINE_bool(build_info, false, + "Print the build info via GetRocksBuildInfoAsString"); + +DEFINE_bool(track_and_verify_wals_in_manifest, false, + "If true, enable WAL tracking in the MANIFEST"); + +namespace ROCKSDB_NAMESPACE { +namespace { +static Status CreateMemTableRepFactory( + const ConfigOptions& config_options, + std::shared_ptr<MemTableRepFactory>* factory) { + Status s; + if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) { + factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead)); +#ifndef ROCKSDB_LITE + } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) { + factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count)); + } else if (!strcasecmp(FLAGS_memtablerep.c_str(), + VectorRepFactory::kNickName())) { + factory->reset(new VectorRepFactory()); + } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) { + factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count)); +#endif // ROCKSDB_LITE + } else { + std::unique_ptr<MemTableRepFactory> unique; + s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep, + &unique); + if (s.ok()) { + factory->reset(unique.release()); + } + } + return s; +} + +} // namespace + +enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal }; + +static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed; + +static enum DistributionType StringToDistributionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "fixed")) + return kFixed; + else if (!strcasecmp(ctype, "uniform")) + return kUniform; + else if (!strcasecmp(ctype, "normal")) + return kNormal; + + fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype); + exit(1); +} + +class BaseDistribution { + public: + BaseDistribution(unsigned int _min, unsigned int _max) + : min_value_size_(_min), max_value_size_(_max) {} + virtual ~BaseDistribution() {} + + unsigned int Generate() { + auto val 
= Get(); + if (NeedTruncate()) { + val = std::max(min_value_size_, val); + val = std::min(max_value_size_, val); + } + return val; + } + + private: + virtual unsigned int Get() = 0; + virtual bool NeedTruncate() { return true; } + unsigned int min_value_size_; + unsigned int max_value_size_; +}; + +class FixedDistribution : public BaseDistribution { + public: + FixedDistribution(unsigned int size) + : BaseDistribution(size, size), size_(size) {} + + private: + virtual unsigned int Get() override { return size_; } + virtual bool NeedTruncate() override { return false; } + unsigned int size_; +}; + +class NormalDistribution : public BaseDistribution, + public std::normal_distribution<double> { + public: + NormalDistribution(unsigned int _min, unsigned int _max) + : BaseDistribution(_min, _max), + // 99.7% values within the range [min, max]. + std::normal_distribution<double>( + (double)(_min + _max) / 2.0 /*mean*/, + (double)(_max - _min) / 6.0 /*stddev*/), + gen_(rd_()) {} + + private: + virtual unsigned int Get() override { + return static_cast<unsigned int>((*this)(gen_)); + } + std::random_device rd_; + std::mt19937 gen_; +}; + +class UniformDistribution : public BaseDistribution, + public std::uniform_int_distribution<unsigned int> { + public: + UniformDistribution(unsigned int _min, unsigned int _max) + : BaseDistribution(_min, _max), + std::uniform_int_distribution<unsigned int>(_min, _max), + gen_(rd_()) {} + + private: + virtual unsigned int Get() override { return (*this)(gen_); } + virtual bool NeedTruncate() override { return false; } + std::random_device rd_; + std::mt19937 gen_; +}; + +// Helper for quickly generating random data. 
+class RandomGenerator { + private: + std::string data_; + unsigned int pos_; + std::unique_ptr<BaseDistribution> dist_; + + public: + RandomGenerator() { + auto max_value_size = FLAGS_value_size_max; + switch (FLAGS_value_size_distribution_type_e) { + case kUniform: + dist_.reset(new UniformDistribution(FLAGS_value_size_min, + FLAGS_value_size_max)); + break; + case kNormal: + dist_.reset( + new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max)); + break; + case kFixed: + default: + dist_.reset(new FixedDistribution(value_size)); + max_value_size = value_size; + } + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < (unsigned)std::max(1048576, max_value_size)) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. + test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(unsigned int len) { + assert(len <= data_.size()); + if (pos_ + len > data_.size()) { + pos_ = 0; + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } + + Slice Generate() { + auto len = dist_->Generate(); + return Generate(len); + } +}; + +static void AppendWithSpace(std::string* str, Slice msg) { + if (msg.empty()) return; + if (!str->empty()) { + str->push_back(' '); + } + str->append(msg.data(), msg.size()); +} + +struct DBWithColumnFamilies { + std::vector<ColumnFamilyHandle*> cfh; + DB* db; +#ifndef ROCKSDB_LITE + OptimisticTransactionDB* opt_txn_db; +#endif // ROCKSDB_LITE + std::atomic<size_t> num_created; // Need to be updated after all the + // new entries in cfh are set. + size_t num_hot; // Number of column families to be queried at each moment. 
+ // After each CreateNewCf(), another num_hot number of new + // Column families will be created and used to be queried. + port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() + std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating + // on cfh[i]. + + DBWithColumnFamilies() + : db(nullptr) +#ifndef ROCKSDB_LITE + , + opt_txn_db(nullptr) +#endif // ROCKSDB_LITE + { + cfh.clear(); + num_created = 0; + num_hot = 0; + } + + DBWithColumnFamilies(const DBWithColumnFamilies& other) + : cfh(other.cfh), + db(other.db), +#ifndef ROCKSDB_LITE + opt_txn_db(other.opt_txn_db), +#endif // ROCKSDB_LITE + num_created(other.num_created.load()), + num_hot(other.num_hot), + cfh_idx_to_prob(other.cfh_idx_to_prob) { + } + + void DeleteDBs() { + std::for_each(cfh.begin(), cfh.end(), + [](ColumnFamilyHandle* cfhi) { delete cfhi; }); + cfh.clear(); +#ifndef ROCKSDB_LITE + if (opt_txn_db) { + delete opt_txn_db; + opt_txn_db = nullptr; + } else { + delete db; + db = nullptr; + } +#else + delete db; + db = nullptr; +#endif // ROCKSDB_LITE + } + + ColumnFamilyHandle* GetCfh(int64_t rand_num) { + assert(num_hot > 0); + size_t rand_offset = 0; + if (!cfh_idx_to_prob.empty()) { + assert(cfh_idx_to_prob.size() == num_hot); + int sum = 0; + while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) { + sum += cfh_idx_to_prob[rand_offset]; + ++rand_offset; + } + assert(rand_offset < cfh_idx_to_prob.size()); + } else { + rand_offset = rand_num % num_hot; + } + return cfh[num_created.load(std::memory_order_acquire) - num_hot + + rand_offset]; + } + + // stage: assume CF from 0 to stage * num_hot has be created. Need to create + // stage * num_hot + 1 to stage * (num_hot + 1). + void CreateNewCf(ColumnFamilyOptions options, int64_t stage) { + MutexLock l(&create_cf_mutex); + if ((stage + 1) * num_hot <= num_created) { + // Already created. 
+ return; + } + auto new_num_created = num_created + num_hot; + assert(new_num_created <= cfh.size()); + for (size_t i = num_created; i < new_num_created; i++) { + Status s = + db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i])); + if (!s.ok()) { + fprintf(stderr, "create column family error: %s\n", + s.ToString().c_str()); + abort(); + } + } + num_created.store(new_num_created, std::memory_order_release); + } +}; + +// A class that reports stats to CSV file. +class ReporterAgent { + public: + ReporterAgent(Env* env, const std::string& fname, + uint64_t report_interval_secs) + : env_(env), + total_ops_done_(0), + last_report_(0), + report_interval_secs_(report_interval_secs), + stop_(false) { + auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions()); + if (s.ok()) { + s = report_file_->Append(Header() + "\n"); + } + if (s.ok()) { + s = report_file_->Flush(); + } + if (!s.ok()) { + fprintf(stderr, "Can't open %s: %s\n", fname.c_str(), + s.ToString().c_str()); + abort(); + } + + reporting_thread_ = port::Thread([&]() { SleepAndReport(); }); + } + + ~ReporterAgent() { + { + std::unique_lock<std::mutex> lk(mutex_); + stop_ = true; + stop_cv_.notify_all(); + } + reporting_thread_.join(); + } + + // thread safe + void ReportFinishedOps(int64_t num_ops) { + total_ops_done_.fetch_add(num_ops); + } + + private: + std::string Header() const { return "secs_elapsed,interval_qps"; } + void SleepAndReport() { + auto* clock = env_->GetSystemClock().get(); + auto time_started = clock->NowMicros(); + while (true) { + { + std::unique_lock<std::mutex> lk(mutex_); + if (stop_ || + stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_), + [&]() { return stop_; })) { + // stopping + break; + } + // else -> timeout, which means time for a report! 
+ } + auto total_ops_done_snapshot = total_ops_done_.load(); + // round the seconds elapsed + auto secs_elapsed = + (clock->NowMicros() - time_started + kMicrosInSecond / 2) / + kMicrosInSecond; + std::string report = + std::to_string(secs_elapsed) + "," + + std::to_string(total_ops_done_snapshot - last_report_) + "\n"; + auto s = report_file_->Append(report); + if (s.ok()) { + s = report_file_->Flush(); + } + if (!s.ok()) { + fprintf(stderr, + "Can't write to report file (%s), stopping the reporting\n", + s.ToString().c_str()); + break; + } + last_report_ = total_ops_done_snapshot; + } + } + + Env* env_; + std::unique_ptr<WritableFile> report_file_; + std::atomic<int64_t> total_ops_done_; + int64_t last_report_; + const uint64_t report_interval_secs_; + ROCKSDB_NAMESPACE::port::Thread reporting_thread_; + std::mutex mutex_; + // will notify on stop + std::condition_variable stop_cv_; + bool stop_; +}; + +enum OperationType : unsigned char { + kRead = 0, + kWrite, + kDelete, + kSeek, + kMerge, + kUpdate, + kCompress, + kUncompress, + kCrc, + kHash, + kOthers +}; + +static std::unordered_map<OperationType, std::string, std::hash<unsigned char>> + OperationTypeString = {{kRead, "read"}, {kWrite, "write"}, + {kDelete, "delete"}, {kSeek, "seek"}, + {kMerge, "merge"}, {kUpdate, "update"}, + {kCompress, "compress"}, {kCompress, "uncompress"}, + {kCrc, "crc"}, {kHash, "hash"}, + {kOthers, "op"}}; + +class CombinedStats; +class Stats { + private: + SystemClock* clock_; + int id_; + uint64_t start_ = 0; + uint64_t sine_interval_; + uint64_t finish_; + double seconds_; + uint64_t done_; + uint64_t last_report_done_; + uint64_t next_report_; + uint64_t bytes_; + uint64_t last_op_finish_; + uint64_t last_report_finish_; + std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>, + std::hash<unsigned char>> + hist_; + std::string message_; + bool exclude_from_merge_; + ReporterAgent* reporter_agent_; // does not own + friend class CombinedStats; + + public: + Stats() : 
clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); } + + void SetReporterAgent(ReporterAgent* reporter_agent) { + reporter_agent_ = reporter_agent; + } + + void Start(int id) { + id_ = id; + next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100; + last_op_finish_ = start_; + hist_.clear(); + done_ = 0; + last_report_done_ = 0; + bytes_ = 0; + seconds_ = 0; + start_ = clock_->NowMicros(); + sine_interval_ = clock_->NowMicros(); + finish_ = start_; + last_report_finish_ = start_; + message_.clear(); + // When set, stats from this thread won't be merged with others. + exclude_from_merge_ = false; + } + + void Merge(const Stats& other) { + if (other.exclude_from_merge_) return; + + for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) { + auto this_it = hist_.find(it->first); + if (this_it != hist_.end()) { + this_it->second->Merge(*(other.hist_.at(it->first))); + } else { + hist_.insert({it->first, it->second}); + } + } + + done_ += other.done_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + if (other.start_ < start_) start_ = other.start_; + if (other.finish_ > finish_) finish_ = other.finish_; + + // Just keep the messages from one thread. 
+ if (message_.empty()) message_ = other.message_; + } + + void Stop() { + finish_ = clock_->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); } + + void SetId(int id) { id_ = id; } + void SetExcludeFromMerge() { exclude_from_merge_ = true; } + + void PrintThreadStatus() { + std::vector<ThreadStatus> thread_list; + FLAGS_env->GetThreadList(&thread_list); + + fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID", + "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage", + "State", "OperationProperties"); + + int64_t current_time = 0; + clock_->GetCurrentTime(¤t_time).PermitUncheckedError(); + for (auto ts : thread_list) { + fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s", + ts.thread_id, + ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(), + ts.cf_name.c_str(), + ThreadStatus::GetOperationName(ts.operation_type).c_str(), + ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(), + ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(), + ThreadStatus::GetStateName(ts.state_type).c_str()); + + auto op_properties = ThreadStatus::InterpretOperationProperties( + ts.operation_type, ts.op_properties); + for (const auto& op_prop : op_properties) { + fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(), + op_prop.second); + } + fprintf(stderr, "\n"); + } + } + + void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); } + + uint64_t GetSineInterval() { return sine_interval_; } + + uint64_t GetStart() { return start_; } + + void ResetLastOpTime() { + // Set to now to avoid latency from calls to SleepForMicroseconds. 
+ last_op_finish_ = clock_->NowMicros(); + } + + void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops, + enum OperationType op_type = kOthers) { + if (reporter_agent_) { + reporter_agent_->ReportFinishedOps(num_ops); + } + if (FLAGS_histogram) { + uint64_t now = clock_->NowMicros(); + uint64_t micros = now - last_op_finish_; + + if (hist_.find(op_type) == hist_.end()) { + auto hist_temp = std::make_shared<HistogramImpl>(); + hist_.insert({op_type, std::move(hist_temp)}); + } + hist_[op_type]->Add(micros); + + if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) { + fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_ += num_ops; + if (done_ >= next_report_ && FLAGS_progress_reports) { + if (!FLAGS_stats_interval) { + if (next_report_ < 1000) + next_report_ += 100; + else if (next_report_ < 5000) + next_report_ += 500; + else if (next_report_ < 10000) + next_report_ += 1000; + else if (next_report_ < 50000) + next_report_ += 5000; + else if (next_report_ < 100000) + next_report_ += 10000; + else if (next_report_ < 500000) + next_report_ += 50000; + else + next_report_ += 100000; + fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, ""); + } else { + uint64_t now = clock_->NowMicros(); + int64_t usecs_since_last = now - last_report_finish_; + + // Determine whether to print status where interval is either + // each N operations or each N seconds. + + if (FLAGS_stats_interval_seconds && + usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) { + // Don't check again for this many operations. + next_report_ += FLAGS_stats_interval; + + } else { + fprintf(stderr, + "%s ... 
thread %d: (%" PRIu64 ",%" PRIu64 + ") ops and " + "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n", + clock_->TimeToString(now / 1000000).c_str(), id_, + done_ - last_report_done_, done_, + (done_ - last_report_done_) / (usecs_since_last / 1000000.0), + done_ / ((now - start_) / 1000000.0), + (now - last_report_finish_) / 1000000.0, + (now - start_) / 1000000.0); + + if (id_ == 0 && FLAGS_stats_per_interval) { + std::string stats; + + if (db_with_cfh && db_with_cfh->num_created.load()) { + for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) { + if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", + &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + if (FLAGS_show_table_properties) { + for (int level = 0; level < FLAGS_num_levels; ++level) { + if (db->GetProperty( + db_with_cfh->cfh[i], + "rocksdb.aggregated-table-properties-at-level" + + std::to_string(level), + &stats)) { + if (stats.find("# entries=0") == std::string::npos) { + fprintf(stderr, "Level[%d]: %s\n", level, + stats.c_str()); + } + } + } + } + } + } else if (db) { + if (db->GetProperty("rocksdb.stats", &stats)) { + fprintf(stderr, "%s", stats.c_str()); + } + if (db->GetProperty("rocksdb.num-running-compactions", &stats)) { + fprintf(stderr, "num-running-compactions: %s\n", stats.c_str()); + } + if (db->GetProperty("rocksdb.num-running-flushes", &stats)) { + fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str()); + } + if (FLAGS_show_table_properties) { + for (int level = 0; level < FLAGS_num_levels; ++level) { + if (db->GetProperty( + "rocksdb.aggregated-table-properties-at-level" + + std::to_string(level), + &stats)) { + if (stats.find("# entries=0") == std::string::npos) { + fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str()); + } + } + } + } + } + } + + next_report_ += FLAGS_stats_interval; + last_report_finish_ = now; + last_report_done_ = done_; + } + } + if (id_ == 0 && FLAGS_thread_status_per_interval) { + PrintThreadStatus(); + } + fflush(stderr); + } + } + + 
  // Accumulate bytes processed, used for the MB/s figure in Report().
  void AddBytes(int64_t n) { bytes_ += n; }

  // Print this thread-group's summary line (micros/op, ops/sec, MB/s) plus
  // optional per-op-type histograms and file-operation counters.
  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedOps().
    if (done_ < 1) done_ = 1;

    std::string extra;
    double elapsed = (finish_ - start_) * 1e-6;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
    double throughput = (double)done_ / elapsed;

    fprintf(stdout,
            "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
            " operations;%s%s\n",
            name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
            elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
    if (FLAGS_histogram) {
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
                it->second->ToString().c_str());
      }
    }
    if (FLAGS_report_file_operations) {
      auto* counted_fs =
          FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
      assert(counted_fs);
      fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
      counted_fs->ResetCounters();
    }
    fflush(stdout);
  }
};

// Aggregates per-run Stats across repeated benchmark runs (the X<n> repeat
// syntax) and reports average / median / 95% confidence intervals.
class CombinedStats {
 public:
  // Fold one finished run's Stats into the throughput sample vectors.
  void AddStats(const Stats& stat) {
    uint64_t total_ops = stat.done_;
    uint64_t total_bytes_ = stat.bytes_;
    double elapsed;

    // Guard against division by zero for benchmarks that did no ops.
    if (total_ops < 1) {
      total_ops = 1;
    }

    elapsed = (stat.finish_ - stat.start_) * 1e-6;
    throughput_ops_.emplace_back(total_ops / elapsed);

    if (total_bytes_ > 0) {
      double mbs = (total_bytes_ / 1048576.0);
      throughput_mbs_.emplace_back(mbs / elapsed);
    }
  }

  // Print "AVG (± CI95)" summary; requires at least two samples.
  void Report(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    // MB/s samples exist only when the runs recorded bytes; print them only
    // when every run contributed one.
    if (throughput_mbs_.size() == throughput_ops_.size()) {
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
              "%.1f) MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)),
              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
    } else {
      fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
              num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)));
    }
  }

  // Print the 95% confidence interval as an explicit (low, high) range.
  void ReportWithConfidenceIntervals(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
    int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      double mbs_avg = CalcAvg(throughput_mbs_);
      double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
      fprintf(stdout,
              "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
              name, num_runs, ops_avg - ops_confidence_95,
              ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
              mbs_avg + mbs_confidence_95);
    } else {
      fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
              ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
    }
  }

  // Final summary: both AVG (± CI95) and MEDIAN lines.
  void ReportFinal(const std::string& bench_name) {
    if (throughput_ops_.size() < 2) {
      // skip if there are not enough samples
      return;
    }

    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      // \xC2\xB1 is +/- character in UTF-8
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
              "%.1f) MB/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)),
              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
              CalcMedian(throughput_mbs_));
    } else {
      fprintf(stdout,
              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              static_cast<int>(CalcConfidence95(throughput_ops_)), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
    }
  }

 private:
  // Arithmetic mean of the samples.
  double CalcAvg(std::vector<double>& data) {
    double avg = 0;
    for (double x : data) {
      avg += x;
    }
    avg = avg / data.size();
    return avg;
  }

  // Calculates 95% CI assuming a normal distribution of samples.
  // Samples are not from a normal distribution, but it still
  // provides useful approximation.
  double CalcConfidence95(std::vector<double>& data) {
    assert(data.size() > 1);
    double avg = CalcAvg(data);
    double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());

    // Z score for the 97.5 percentile
    // see https://en.wikipedia.org/wiki/1.96
    return 1.959964 * std_error;
  }

  // Median; NOTE: sorts `data` in place.
  double CalcMedian(std::vector<double>& data) {
    assert(data.size() > 0);
    std::sort(data.begin(), data.end());

    size_t mid = data.size() / 2;
    if (data.size() % 2 == 1) {
      // Odd number of entries
      return data[mid];
    } else {
      // Even number of entries
      return (data[mid] + data[mid - 1]) / 2;
    }
  }

  // Sample standard deviation around `average`.
  double CalcStdDev(std::vector<double>& data, double average) {
    assert(data.size() > 1);
    double squared_sum = 0.0;
    for (double x : data) {
      squared_sum += std::pow(x - average, 2);
    }

    // using samples count - 1 following Bessel's correction
    // see https://en.wikipedia.org/wiki/Bessel%27s_correction
    return std::sqrt(squared_sum / (data.size() - 1));
  }

  std::vector<double> throughput_ops_;   // ops/sec, one entry per run
  std::vector<double> throughput_mbs_;   // MB/sec, one entry per run with bytes
};

// Monotonic counter standing in for an application clock when user-defined
// timestamps (--user_timestamp_size) are exercised.
class TimestampEmulator {
 private:
  std::atomic<uint64_t> timestamp_;

 public:
  TimestampEmulator() : timestamp_(0) {}
  uint64_t Get() const { return timestamp_.load(); }
  void Inc() { timestamp_++; }
  // Encode the next timestamp into `scratch` and return it as a Slice.
  Slice Allocate(char* scratch) {
    // TODO: support larger timestamp sizes
    assert(FLAGS_user_timestamp_size == 8);
    assert(scratch);
    uint64_t ts = timestamp_.fetch_add(1);
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
  // Timestamp for a read: either the latest (which also advances the clock)
  // or a uniformly random timestamp from the past.
  Slice GetTimestampForRead(Random64& rand, char* scratch) {
    assert(FLAGS_user_timestamp_size == 8);
    assert(scratch);
    if (FLAGS_read_with_latest_user_timestamp) {
      return Allocate(scratch);
    }
    // Choose a random timestamp from the past.
    uint64_t ts = rand.Next() % Get();
    EncodeFixed64(scratch, ts);
    return Slice(scratch, FLAGS_user_timestamp_size);
  }
};

// State shared by all concurrent executions of the same benchmark.
struct SharedState {
  port::Mutex mu;
  port::CondVar cv;
  int total;        // number of worker threads in this benchmark
  int perf_level;
  std::shared_ptr<RateLimiter> write_rate_limiter;
  std::shared_ptr<RateLimiter> read_rate_limiter;

  // Each thread goes through the following states:
  //    (1) initializing
  //    (2) waiting for others to be initialized
  //    (3) running
  //    (4) done

  long num_initialized;
  long num_done;
  bool start;

  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;        // 0..n-1 when running in n threads
  Random64 rand;  // Has different seeds for different threads
  Stats stats;
  SharedState* shared;

  // `seed_base` is defined elsewhere in this file; each thread offsets it by
  // `my_seed` so streams are distinct but reproducible.
  explicit ThreadState(int index, int my_seed)
      : tid(index), rand(seed_base + my_seed) {}
};

// Tracks how long a benchmark should keep running: bounded either by wall
// clock (`max_seconds`) or by operation count (`max_ops`), optionally split
// into stages of `ops_per_stage` operations.
class Duration {
 public:
  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
    max_seconds_ = max_seconds;
    max_ops_ = max_ops;
    ops_per_stage_ = (ops_per_stage > 0) ?
ops_per_stage : max_ops; + ops_ = 0; + start_at_ = FLAGS_env->NowMicros(); + } + + int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } + + bool Done(int64_t increment) { + if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops + ops_ += increment; + + if (max_seconds_) { + // Recheck every appx 1000 ops (exact iff increment is factor of 1000) + auto granularity = FLAGS_ops_between_duration_checks; + if ((ops_ / granularity) != ((ops_ - increment) / granularity)) { + uint64_t now = FLAGS_env->NowMicros(); + return ((now - start_at_) / 1000000) >= max_seconds_; + } else { + return false; + } + } else { + return ops_ > max_ops_; + } + } + + private: + uint64_t max_seconds_; + int64_t max_ops_; + int64_t ops_per_stage_; + int64_t ops_; + uint64_t start_at_; +}; + +class Benchmark { + private: + std::shared_ptr<Cache> cache_; + std::shared_ptr<Cache> compressed_cache_; + std::shared_ptr<const SliceTransform> prefix_extractor_; + DBWithColumnFamilies db_; + std::vector<DBWithColumnFamilies> multi_dbs_; + int64_t num_; + int key_size_; + int user_timestamp_size_; + int prefix_size_; + int total_thread_count_; + int64_t keys_per_prefix_; + int64_t entries_per_batch_; + int64_t writes_before_delete_range_; + int64_t writes_per_range_tombstone_; + int64_t range_tombstone_width_; + int64_t max_num_range_tombstones_; + ReadOptions read_options_; + WriteOptions write_options_; + Options open_options_; // keep options around to properly destroy db later +#ifndef ROCKSDB_LITE + TraceOptions trace_options_; + TraceOptions block_cache_trace_options_; +#endif + int64_t reads_; + int64_t deletes_; + double read_random_exp_range_; + int64_t writes_; + int64_t readwrites_; + int64_t merge_keys_; + bool report_file_operations_; + bool use_blob_db_; // Stacked BlobDB + bool read_operands_; // read via GetMergeOperands() + std::vector<std::string> keys_; + + class ErrorHandlerListener : public EventListener { + public: +#ifndef ROCKSDB_LITE + 
    ErrorHandlerListener()
        : mutex_(),
          cv_(&mutex_),
          no_auto_recovery_(false),
          recovery_complete_(false) {}

    ~ErrorHandlerListener() override {}

    const char* Name() const override { return kClassName(); }
    static const char* kClassName() { return "ErrorHandlerListener"; }

    // Veto automatic recovery when auto-recovery has been disabled via
    // EnableAutoRecovery(false).
    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
                              Status /*bg_error*/,
                              bool* auto_recovery) override {
      if (*auto_recovery && no_auto_recovery_) {
        *auto_recovery = false;
      }
    }

    // Signal any thread blocked in WaitForRecovery().
    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
      InstrumentedMutexLock l(&mutex_);
      recovery_complete_ = true;
      cv_.SignalAll();
    }

    // Block until recovery completes or `abs_time_us` passes. Returns true
    // iff recovery completed; consumes the completion flag.
    bool WaitForRecovery(uint64_t abs_time_us) {
      InstrumentedMutexLock l(&mutex_);
      if (!recovery_complete_) {
        cv_.TimedWait(abs_time_us);
      }
      if (recovery_complete_) {
        recovery_complete_ = false;
        return true;
      }
      return false;
    }

    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }

   private:
    InstrumentedMutex mutex_;
    InstrumentedCondVar cv_;
    bool no_auto_recovery_;
    bool recovery_complete_;
#else   // ROCKSDB_LITE
    // LITE builds have no background error recovery; provide no-op stubs.
    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
    void EnableAutoRecovery(bool /*enable*/) {}
#endif  // ROCKSDB_LITE
  };

  std::shared_ptr<ErrorHandlerListener> listener_;

  // Emulated application clock for user-defined-timestamp benchmarks.
  std::unique_ptr<TimestampEmulator> mock_app_clock_;

  // Validate flag combinations before running anything.
  bool SanityCheck() {
    if (FLAGS_compression_ratio > 1) {
      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
      return false;
    }
    return true;
  }

  // Compress `input` with the configured codec; returns false if compression
  // is unavailable or ineffective.
  inline bool CompressSlice(const CompressionInfo& compression_info,
                            const Slice& input, std::string* compressed) {
    constexpr uint32_t compress_format_version = 2;

    return CompressData(input, compression_info, compress_format_version,
                        compressed);
  }

  // Print the run configuration banner (key/value sizes, entry counts,
  // estimated data sizes, rate limits, compression, memtable rep).
  void PrintHeader(const Options& options) {
    PrintEnvironment();
    fprintf(stdout,
            "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
            FLAGS_key_size, FLAGS_user_timestamp_size);
    auto avg_value_size = FLAGS_value_size;
    if (FLAGS_value_size_distribution_type_e == kFixed) {
      fprintf(stdout,
              "Values: %d bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
    } else {
      // Non-fixed distribution: report the midpoint as the average.
      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
      fprintf(stdout,
              "Values: %d avg bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
              FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
              FLAGS_value_size_max);
    }
    fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
    fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
             1048576.0));
    fprintf(
        stdout, "FileSize: %.1f MB (estimated)\n",
        (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
         1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);
    if (options.memtable_factory != nullptr) {
      fprintf(stdout, "Memtablerep: %s\n",
              options.memtable_factory->GetId().c_str());
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }

  // Warn about build/runtime conditions that invalidate benchmark numbers
  // (no optimization, assertions on, ineffective compression).
  void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(
        stdout,
        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
      // The test string should not be too small.
      const int len = FLAGS_block_size;
      std::string input_str(len, 'y');
      std::string compressed;
      CompressionOptions opts;
      CompressionContext context(FLAGS_compression_type_e);
      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                           FLAGS_compression_type_e,
                           FLAGS_sample_for_compression);
      bool result = CompressSlice(info, Slice(input_str), &compressed);

      if (!result) {
        fprintf(stdout, "WARNING: %s compression is not enabled\n",
                compression);
      } else if (compressed.size() >= input_str.size()) {
        fprintf(stdout, "WARNING: %s compression is not effective\n",
                compression);
      }
    }
  }

// Current the following isn't equivalent to OS_LINUX.
#if defined(__linux)
  // Strip leading/trailing whitespace from a Slice (used when parsing
  // /proc/cpuinfo lines).
  static Slice TrimSpace(Slice s) {
    unsigned int start = 0;
    while (start < s.size() && isspace(s[start])) {
      start++;
    }
    unsigned int limit = static_cast<unsigned int>(s.size());
    while (limit > start && isspace(s[limit - 1])) {
      limit--;
    }
    return Slice(s.data() + start, limit - start);
  }
#endif

  // Print host environment info (RocksDB version, date, CPU model/cache).
  void PrintEnvironment() {
    fprintf(stderr, "RocksDB: version %s\n",
            GetRocksVersionAsString(true).c_str());

#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
+ fprintf(stderr, "Date: %s", + ctime_r(&now, buf)); // ctime_r() adds newline + +#if defined(__linux) + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != nullptr) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != nullptr) { + const char* sep = strchr(line, ':'); + if (sep == nullptr) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#elif defined(__APPLE__) + struct host_basic_info h; + size_t hlen = HOST_BASIC_INFO_COUNT; + if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h, + (uint32_t*)&hlen) == KERN_SUCCESS) { + std::string cpu_type; + std::string cache_size; + size_t hcache_size; + hlen = sizeof(hcache_size); + if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) { + cache_size = std::to_string(hcache_size); + } + switch (h.cpu_type) { + case CPU_TYPE_X86_64: + cpu_type = "x86_64"; + break; + case CPU_TYPE_ARM64: + cpu_type = "arm64"; + break; + default: + break; + } + fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#elif defined(__FreeBSD__) + int ncpus; + size_t len = sizeof(ncpus); + int mib[2] = {CTL_HW, HW_NCPU}; + if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) { + char cpu_type[16]; + len = sizeof(cpu_type) - 1; + mib[1] = HW_MACHINE; + if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0; + + fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type); + // no programmatic way to get the cache line size except on PPC + } +#endif +#endif + } + + static bool KeyExpired(const 
TimestampEmulator* timestamp_emulator, + const Slice& key) { + const char* pos = key.data(); + pos += 8; + uint64_t timestamp = 0; + if (port::kLittleEndian) { + int bytes_to_fill = 8; + for (int i = 0; i < bytes_to_fill; ++i) { + timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i])) + << ((bytes_to_fill - i - 1) << 3)); + } + } else { + memcpy(×tamp, pos, sizeof(timestamp)); + } + return timestamp_emulator->Get() - timestamp > FLAGS_time_range; + } + + class ExpiredTimeFilter : public CompactionFilter { + public: + explicit ExpiredTimeFilter( + const std::shared_ptr<TimestampEmulator>& timestamp_emulator) + : timestamp_emulator_(timestamp_emulator) {} + bool Filter(int /*level*/, const Slice& key, + const Slice& /*existing_value*/, std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return KeyExpired(timestamp_emulator_.get(), key); + } + const char* Name() const override { return "ExpiredTimeFilter"; } + + private: + std::shared_ptr<TimestampEmulator> timestamp_emulator_; + }; + + class KeepFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return "KeepFilter"; } + }; + + static std::shared_ptr<MemoryAllocator> GetCacheAllocator() { + std::shared_ptr<MemoryAllocator> allocator; + + if (FLAGS_use_cache_jemalloc_no_dump_allocator) { + JemallocAllocatorOptions jemalloc_options; + if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) { + fprintf(stderr, "JemallocNodumpAllocator not supported.\n"); + exit(1); + } + } else if (FLAGS_use_cache_memkind_kmem_allocator) { +#ifdef MEMKIND + allocator = std::make_shared<MemkindKmemAllocator>(); +#else + fprintf(stderr, "Memkind library is not linked with the binary.\n"); + exit(1); +#endif + } + + return allocator; + } + + static std::shared_ptr<Cache> NewCache(int64_t 
                                         capacity) {
    // Construct the block cache selected by --cache_type with the given
    // capacity; returns nullptr for non-positive capacity (cache disabled).
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_cache_type == "clock_cache") {
      fprintf(stderr, "Old clock cache implementation has been removed.\n");
      exit(1);
    } else if (FLAGS_cache_type == "hyper_clock_cache") {
      return HyperClockCacheOptions(static_cast<size_t>(capacity),
                                    FLAGS_block_size /*estimated_entry_charge*/,
                                    FLAGS_cache_numshardbits)
          .MakeSharedCache();
    } else if (FLAGS_cache_type == "lru_cache") {
      LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
          GetCacheAllocator(), kDefaultToAdaptiveMutex,
          kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);

#ifndef ROCKSDB_LITE
      // Optional secondary cache created from a URI-style config string.
      // NOTE(review): `secondary_cache` is presumably a file-level variable
      // declared outside this chunk — confirm against the full file.
      if (!FLAGS_secondary_cache_uri.empty()) {
        Status s = SecondaryCache::CreateFromString(
            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
        if (secondary_cache == nullptr) {
          fprintf(
              stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.secondary_cache = secondary_cache;
      }
#endif  // ROCKSDB_LITE

      if (FLAGS_use_compressed_secondary_cache) {
        CompressedSecondaryCacheOptions secondary_cache_opts;
        secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
        secondary_cache_opts.num_shard_bits =
            FLAGS_compressed_secondary_cache_numshardbits;
        secondary_cache_opts.high_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
        secondary_cache_opts.low_pri_pool_ratio =
            FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
        secondary_cache_opts.compression_type =
            FLAGS_compressed_secondary_cache_compression_type_e;
        secondary_cache_opts.compress_format_version =
            FLAGS_compressed_secondary_cache_compress_format_version;
        opts.secondary_cache =
            NewCompressedSecondaryCache(secondary_cache_opts);
      }

      return NewLRUCache(opts);
    } else {
      fprintf(stderr, "Cache type not supported.");
      exit(1);
    }
  }

 public:
  // Construct the Benchmark from command-line flags: build caches, derive
  // per-benchmark defaults, and (unless --use_existing_db) destroy any
  // pre-existing DB at --db so runs start clean.
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        prefix_extractor_(FLAGS_prefix_size != 0
                              ? NewFixedPrefixTransform(FLAGS_prefix_size)
                              : nullptr),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        user_timestamp_size_(FLAGS_user_timestamp_size),
        prefix_size_(FLAGS_prefix_size),
        total_thread_count_(0),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db),  // Stacked BlobDB
#else
        use_blob_db_(false),  // Stacked BlobDB
#endif  // !ROCKSDB_LITE
        read_operands_(false) {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      // Wrap the env's file system so file operations can be counted.
      FLAGS_env = new CompositeEnvWrapper(
          FLAGS_env,
          std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    // Remove leftover heap-profile files from earlier runs.
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        // Stacked BlobDB
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        // Multi-DB mode: --db becomes a parent directory of the DBs.
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
    if (user_timestamp_size_ > 0) {
      mock_app_clock_.reset(new TimestampEmulator());
    }
  }

  // Close/delete the primary DB and all DBs opened in multi-DB mode.
  void DeleteDBs() {
    db_.DeleteDBs();
    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
      delete dbwcf.db;
    }
  }

  ~Benchmark() {
    DeleteDBs();
    if (cache_.get() != nullptr) {
      // Clear cache reference first
      open_options_.write_buffer_manager.reset();
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }

  // Allocate a key buffer of key_size_ bytes, owned by *key_guard, and
  // return a Slice over it.
  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    char* data = new char[key_size_];
    const char* const_data = data;
    key_guard->reset(const_data);
    return Slice(key_guard->get(), key_size_);
  }

  // Generate key according to the given specification and random number.
  // The resulting key will have the following format:
  //   - If keys_per_prefix_ is positive, extra trailing bytes are either cut
  //     off or padded with '0'.
  //     The prefix value is derived from key value.
+ // ---------------------------- + // | prefix 00000 | key 00000 | + // ---------------------------- + // + // - If keys_per_prefix_ is 0, the key is simply a binary representation of + // random number followed by trailing '0's + // ---------------------------- + // | key 00000 | + // ---------------------------- + void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) { + if (!keys_.empty()) { + assert(FLAGS_use_existing_keys); + assert(keys_.size() == static_cast<size_t>(num_keys)); + assert(v < static_cast<uint64_t>(num_keys)); + *key = keys_[v]; + return; + } + char* start = const_cast<char*>(key->data()); + char* pos = start; + if (keys_per_prefix_ > 0) { + int64_t num_prefix = num_keys / keys_per_prefix_; + int64_t prefix = v % num_prefix; + int bytes_to_fill = std::min(prefix_size_, 8); + if (port::kLittleEndian) { + for (int i = 0; i < bytes_to_fill; ++i) { + pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; + } + } else { + memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill); + } + if (prefix_size_ > 8) { + // fill the rest with 0s + memset(pos + 8, '0', prefix_size_ - 8); + } + pos += prefix_size_; + } + + int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8); + if (port::kLittleEndian) { + for (int i = 0; i < bytes_to_fill; ++i) { + pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; + } + } else { + memcpy(pos, static_cast<void*>(&v), bytes_to_fill); + } + pos += bytes_to_fill; + if (key_size_ > pos - start) { + memset(pos, '0', key_size_ - (pos - start)); + } + } + + void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) { + GenerateKeyFromInt(v, num_keys, key); + if (FLAGS_seek_missing_prefix) { + assert(prefix_size_ > 8); + char* key_ptr = const_cast<char*>(key->data()); + // This rely on GenerateKeyFromInt filling paddings with '0's. + // Putting a '1' will create a non-existing prefix. 
+ key_ptr[8] = '1'; + } + } + + std::string GetPathForMultiple(std::string base_name, size_t id) { + if (!base_name.empty()) { +#ifndef OS_WIN + if (base_name.back() != '/') { + base_name += '/'; + } +#else + if (base_name.back() != '\\') { + base_name += '\\'; + } +#endif + } + return base_name + std::to_string(id); + } + + void VerifyDBFromDB(std::string& truth_db_name) { + DBWithColumnFamilies truth_db; + auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro)); + std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro)); + // Verify that all the key/values in truth_db are retrivable in db with + // ::Get + fprintf(stderr, "Verifying db >= truth_db with ::Get...\n"); + for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) { + std::string value; + s = db_.db->Get(ro, truth_iter->key(), &value); + assert(s.ok()); + // TODO(myabandeh): provide debugging hints + assert(Slice(value) == truth_iter->value()); + } + // Verify that the db iterator does not give any extra key/value + fprintf(stderr, "Verifying db == truth_db...\n"); + for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); + db_iter->Next(), truth_iter->Next()) { + assert(truth_iter->Valid()); + assert(truth_iter->value() == db_iter->value()); + } + // No more key should be left unchecked in truth_db + assert(!truth_iter->Valid()); + fprintf(stderr, "...Verified\n"); + } + + void ErrorExit() { + DeleteDBs(); + exit(1); + } + + void Run() { + if (!SanityCheck()) { + ErrorExit(); + } + Open(&open_options_); + PrintHeader(open_options_); + std::stringstream benchmark_stream(FLAGS_benchmarks); + std::string name; + std::unique_ptr<ExpiredTimeFilter> filter; + while (std::getline(benchmark_stream, name, ',')) { + // Sanitize parameters + num_ = 
          FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
      value_size = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = FLAGS_batch_size;
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
      write_options_ = WriteOptions();
      read_random_exp_range_ = FLAGS_read_random_exp_range;
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;
      write_options_.rate_limiter_priority =
          FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
      read_options_ = ReadOptions(FLAGS_verify_checksum, true);
      read_options_.total_order_seek = FLAGS_total_order_seek;
      read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
      read_options_.rate_limiter_priority =
          FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
      read_options_.tailing = FLAGS_use_tailing_iterator;
      read_options_.readahead_size = FLAGS_readahead_size;
      read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
      read_options_.async_io = FLAGS_async_io;
      read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;

      // Benchmark dispatch target and optional post-run hook.
      void (Benchmark::*method)(ThreadState*) = nullptr;
      void (Benchmark::*post_process_method)() = nullptr;

      bool fresh_db = false;
      int num_threads = FLAGS_threads;

      int num_repeat = 1;
      int num_warmup = 0;
      // Parse trailing "[X<n>-W<m>]" arguments: repeat count and warmup
      // count for this benchmark entry.
      if (!name.empty() && *name.rbegin() == ']') {
        auto it = name.find('[');
        if (it == std::string::npos) {
          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
          ErrorExit();
        }
        std::string args = name.substr(it + 1);
        args.resize(args.size() - 1);
        name.resize(it);

        std::string bench_arg;
        std::stringstream args_stream(args);
        while (std::getline(args_stream, bench_arg, '-')) {
          if (bench_arg.empty()) {
            continue;
          }
          if (bench_arg[0] == 'X') {
            // Repeat the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_repeat = std::stoi(num_str);
          } else if (bench_arg[0] == 'W') {
            // Warm up the benchmark for n times
            std::string num_str = bench_arg.substr(1);
            num_warmup = std::stoi(num_str);
          }
        }
      }

      // Both fillseqdeterministic and filluniquerandomdeterministic
      // fill the levels except the max level with UNIQUE_RANDOM
      // and fill the max level with fillseq and filluniquerandom, respectively
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please disable_auto_compactions in FillDeterministic "
                  "benchmark\n");
          ErrorExit();
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
      } else if (name == "fillseq") {
        fresh_db = true;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillbatch") {
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillrandom") {
        fresh_db = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "filluniquerandom" ||
                 name == "fillanddeleteuniquerandom") {
        fresh_db = true;
        if (num_threads > 1) {
          fprintf(stderr,
                  "filluniquerandom and fillanddeleteuniquerandom "
                  "multithreaded not supported, use 1 thread");
          num_threads = 1;
        }
        method = &Benchmark::WriteUniqueRandom;
      } else if (name == "overwrite") {
        method = &Benchmark::WriteRandom;
      } else if (name == "fillsync") {
        // Synced writes are slow, so scale the op count down 1000x.
        fresh_db = true;
        num_ /= 1000;
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "fill100K") {
        fresh_db = true;
        num_ /= 1000;
        value_size = 100 * 1000;
        method = &Benchmark::WriteRandom;
      } else if (name == "readseq") {
        method = &Benchmark::ReadSequential;
      } else if (name == "readtorowcache") {
        if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
          fprintf(stderr,
                  "Please set use_existing_keys to true and specify a "
                  "row cache size in readtorowcache benchmark\n");
          ErrorExit();
        }
        method = &Benchmark::ReadToRowCache;
      } else if (name == "readtocache") {
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
      } else if (name == "readreverse") {
        method = &Benchmark::ReadReverse;
      } else if (name == "readrandom") {
        if (FLAGS_multiread_stride) {
          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                  entries_per_batch_);
        }
        method = &Benchmark::ReadRandom;
      } else if (name == "readrandomfast") {
        method = &Benchmark::ReadRandomFast;
      } else if (name == "multireadrandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::MultiReadRandom;
      } else if (name == "multireadwhilewriting") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        num_threads++;
        method = &Benchmark::MultiReadWhileWriting;
      } else if (name == "approximatesizerandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::ApproximateSizeRandom;
      } else if (name == "mixgraph") {
        method = &Benchmark::MixGraph;
      } else if (name == "readmissing") {
        // Longer keys than were written guarantee misses.
        ++key_size_;
        method = &Benchmark::ReadRandom;
      } else if (name == "newiterator") {
        method = &Benchmark::IteratorCreation;
      } else if (name == "newiteratorwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
      } else if (name == "seekrandom") {
        method = &Benchmark::SeekRandom;
      } else if (name == "seekrandomwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
      } else if (name == "seekrandomwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::SeekRandomWhileMerging;
      } else if (name == "readrandomsmall") {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == "deleteseq") {
        method = &Benchmark::DeleteSeq;
      } else if (name == "deleterandom") {
        method = &Benchmark::DeleteRandom;
      } else if (name == "readwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
      } else if (name == "readwhilemerging") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileMerging;
      } else if (name == "readwhilescanning") {
        num_threads++;  // Add extra thread for scaning
        method = &Benchmark::ReadWhileScanning;
      } else if (name == "readrandomwriterandom") {
        method = &Benchmark::ReadRandomWriteRandom;
      } else if (name == "readrandommergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
ErrorExit(); + } + method = &Benchmark::ReadRandomMergeRandom; + } else if (name == "updaterandom") { + method = &Benchmark::UpdateRandom; + } else if (name == "xorupdaterandom") { + method = &Benchmark::XORUpdateRandom; + } else if (name == "appendrandom") { + method = &Benchmark::AppendRandom; + } else if (name == "mergerandom") { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.c_str()); + exit(1); + } + method = &Benchmark::MergeRandom; + } else if (name == "randomwithverify") { + method = &Benchmark::RandomWithVerify; + } else if (name == "fillseekseq") { + method = &Benchmark::WriteSeqSeekSeq; + } else if (name == "compact") { + method = &Benchmark::Compact; + } else if (name == "compactall") { + CompactAll(); +#ifndef ROCKSDB_LITE + } else if (name == "compact0") { + CompactLevel(0); + } else if (name == "compact1") { + CompactLevel(1); + } else if (name == "waitforcompaction") { + WaitForCompaction(); +#endif + } else if (name == "flush") { + Flush(); + } else if (name == "crc32c") { + method = &Benchmark::Crc32c; + } else if (name == "xxhash") { + method = &Benchmark::xxHash; + } else if (name == "xxhash64") { + method = &Benchmark::xxHash64; + } else if (name == "xxh3") { + method = &Benchmark::xxh3; + } else if (name == "acquireload") { + method = &Benchmark::AcquireLoad; + } else if (name == "compress") { + method = &Benchmark::Compress; + } else if (name == "uncompress") { + method = &Benchmark::Uncompress; +#ifndef ROCKSDB_LITE + } else if (name == "randomtransaction") { + method = &Benchmark::RandomTransaction; + post_process_method = &Benchmark::RandomTransactionVerify; +#endif // ROCKSDB_LITE + } else if (name == "randomreplacekeys") { + fresh_db = true; + method = &Benchmark::RandomReplaceKeys; + } else if (name == "timeseries") { + timestamp_emulator_.reset(new TimestampEmulator()); + if (FLAGS_expire_style == "compaction_filter") { + filter.reset(new 
ExpiredTimeFilter(timestamp_emulator_)); + fprintf(stdout, "Compaction filter is used to remove expired data"); + open_options_.compaction_filter = filter.get(); + } + fresh_db = true; + method = &Benchmark::TimeSeries; + } else if (name == "block_cache_entry_stats") { + // DB::Properties::kBlockCacheEntryStats + PrintStats("rocksdb.block-cache-entry-stats"); + } else if (name == "stats") { + PrintStats("rocksdb.stats"); + } else if (name == "resetstats") { + ResetStats(); + } else if (name == "verify") { + VerifyDBFromDB(FLAGS_truth_db); + } else if (name == "levelstats") { + PrintStats("rocksdb.levelstats"); + } else if (name == "memstats") { + std::vector<std::string> keys{"rocksdb.num-immutable-mem-table", + "rocksdb.cur-size-active-mem-table", + "rocksdb.cur-size-all-mem-tables", + "rocksdb.size-all-mem-tables", + "rocksdb.num-entries-active-mem-table", + "rocksdb.num-entries-imm-mem-tables"}; + PrintStats(keys); + } else if (name == "sstables") { + PrintStats("rocksdb.sstables"); + } else if (name == "stats_history") { + PrintStatsHistory(); +#ifndef ROCKSDB_LITE + } else if (name == "replay") { + if (num_threads > 1) { + fprintf(stderr, "Multi-threaded replay is not yet supported\n"); + ErrorExit(); + } + if (FLAGS_trace_file == "") { + fprintf(stderr, "Please set --trace_file to be replayed from\n"); + ErrorExit(); + } + method = &Benchmark::Replay; +#endif // ROCKSDB_LITE + } else if (name == "getmergeoperands") { + method = &Benchmark::GetMergeOperands; +#ifndef ROCKSDB_LITE + } else if (name == "verifychecksum") { + method = &Benchmark::VerifyChecksum; + } else if (name == "verifyfilechecksums") { + method = &Benchmark::VerifyFileChecksums; +#endif // ROCKSDB_LITE + } else if (name == "readrandomoperands") { + read_operands_ = true; + method = &Benchmark::ReadRandom; +#ifndef ROCKSDB_LITE + } else if (name == "backup") { + method = &Benchmark::Backup; + } else if (name == "restore") { + method = &Benchmark::Restore; +#endif + } else if (!name.empty()) { 
// No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.c_str()); + ErrorExit(); + } + + if (fresh_db) { + if (FLAGS_use_existing_db) { + fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", + name.c_str()); + method = nullptr; + } else { + if (db_.db != nullptr) { + db_.DeleteDBs(); + DestroyDB(FLAGS_db, open_options_); + } + Options options = open_options_; + for (size_t i = 0; i < multi_dbs_.size(); i++) { + delete multi_dbs_[i].db; + if (!open_options_.wal_dir.empty()) { + options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i); + } + DestroyDB(GetPathForMultiple(FLAGS_db, i), options); + } + multi_dbs_.clear(); + } + Open(&open_options_); // use open_options for the last accessed + } + + if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + +#ifndef ROCKSDB_LITE + if (name == "backup") { + std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl; + } else if (name == "restore") { + std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl; + std::cout << "Restore path: [" << FLAGS_restore_dir << "]" + << std::endl; + } + // A trace_file option can be provided both for trace and replay + // operations. But db_bench does not support tracing and replaying at + // the same time, for now. So, start tracing only when it is not a + // replay. + if (FLAGS_trace_file != "" && name != "replay") { + std::unique_ptr<TraceWriter> trace_writer; + Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(), + FLAGS_trace_file, &trace_writer); + if (!s.ok()) { + fprintf(stderr, "Encountered an error starting a trace, %s\n", + s.ToString().c_str()); + ErrorExit(); + } + s = db_.db->StartTrace(trace_options_, std::move(trace_writer)); + if (!s.ok()) { + fprintf(stderr, "Encountered an error starting a trace, %s\n", + s.ToString().c_str()); + ErrorExit(); + } + fprintf(stdout, "Tracing the workload to: [%s]\n", + FLAGS_trace_file.c_str()); + } + // Start block cache tracing. 
+ if (!FLAGS_block_cache_trace_file.empty()) { + // Sanity checks. + if (FLAGS_block_cache_trace_sampling_frequency <= 0) { + fprintf(stderr, + "Block cache trace sampling frequency must be higher than " + "0.\n"); + ErrorExit(); + } + if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) { + fprintf(stderr, + "The maximum file size for block cache tracing must be " + "higher than 0.\n"); + ErrorExit(); + } + block_cache_trace_options_.max_trace_file_size = + FLAGS_block_cache_trace_max_trace_file_size_in_bytes; + block_cache_trace_options_.sampling_frequency = + FLAGS_block_cache_trace_sampling_frequency; + std::unique_ptr<TraceWriter> block_cache_trace_writer; + Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(), + FLAGS_block_cache_trace_file, + &block_cache_trace_writer); + if (!s.ok()) { + fprintf(stderr, + "Encountered an error when creating trace writer, %s\n", + s.ToString().c_str()); + ErrorExit(); + } + s = db_.db->StartBlockCacheTrace(block_cache_trace_options_, + std::move(block_cache_trace_writer)); + if (!s.ok()) { + fprintf( + stderr, + "Encountered an error when starting block cache tracing, %s\n", + s.ToString().c_str()); + ErrorExit(); + } + fprintf(stdout, "Tracing block cache accesses to: [%s]\n", + FLAGS_block_cache_trace_file.c_str()); + } +#endif // ROCKSDB_LITE + + if (num_warmup > 0) { + printf("Warming up benchmark by running %d times\n", num_warmup); + } + + for (int i = 0; i < num_warmup; i++) { + RunBenchmark(num_threads, name, method); + } + + if (num_repeat > 1) { + printf("Running benchmark for %d times\n", num_repeat); + } + + CombinedStats combined_stats; + for (int i = 0; i < num_repeat; i++) { + Stats stats = RunBenchmark(num_threads, name, method); + combined_stats.AddStats(stats); + if (FLAGS_confidence_interval_only) { + combined_stats.ReportWithConfidenceIntervals(name); + } else { + combined_stats.Report(name); + } + } + if (num_repeat > 1) { + combined_stats.ReportFinal(name); + } + } + if (post_process_method 
!= nullptr) { + (this->*post_process_method)(); + } + } + + if (secondary_update_thread_) { + secondary_update_stopped_.store(1, std::memory_order_relaxed); + secondary_update_thread_->join(); + secondary_update_thread_.reset(); + } + +#ifndef ROCKSDB_LITE + if (name != "replay" && FLAGS_trace_file != "") { + Status s = db_.db->EndTrace(); + if (!s.ok()) { + fprintf(stderr, "Encountered an error ending the trace, %s\n", + s.ToString().c_str()); + } + } + if (!FLAGS_block_cache_trace_file.empty()) { + Status s = db_.db->EndBlockCacheTrace(); + if (!s.ok()) { + fprintf(stderr, + "Encountered an error ending the block cache tracing, %s\n", + s.ToString().c_str()); + } + } +#endif // ROCKSDB_LITE + + if (FLAGS_statistics) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + if (FLAGS_simcache_size >= 0) { + fprintf( + stdout, "SIMULATOR CACHE STATISTICS:\n%s\n", + static_cast_with_check<SimCache>(cache_.get())->ToString().c_str()); + } + +#ifndef ROCKSDB_LITE + if (FLAGS_use_secondary_db) { + fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", + secondary_db_updates_); + } +#endif // ROCKSDB_LITE + } + + private: + std::shared_ptr<TimestampEmulator> timestamp_emulator_; + std::unique_ptr<port::Thread> secondary_update_thread_; + std::atomic<int> secondary_update_stopped_{0}; +#ifndef ROCKSDB_LITE + uint64_t secondary_db_updates_ = 0; +#endif // ROCKSDB_LITE + struct ThreadArg { + Benchmark* bm; + SharedState* shared; + ThreadState* thread; + void (Benchmark::*method)(ThreadState*); + }; + + static void ThreadBody(void* v) { + ThreadArg* arg = reinterpret_cast<ThreadArg*>(v); + SharedState* shared = arg->shared; + ThreadState* thread = arg->thread; + { + MutexLock l(&shared->mu); + shared->num_initialized++; + if (shared->num_initialized >= shared->total) { + shared->cv.SignalAll(); + } + while (!shared->start) { + shared->cv.Wait(); + } + } + + SetPerfLevel(static_cast<PerfLevel>(shared->perf_level)); + 
perf_context.EnablePerLevelPerfContext(); + thread->stats.Start(thread->tid); + (arg->bm->*(arg->method))(thread); + if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) { + thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") + + get_perf_context()->ToString()); + } + thread->stats.Stop(); + + { + MutexLock l(&shared->mu); + shared->num_done++; + if (shared->num_done >= shared->total) { + shared->cv.SignalAll(); + } + } + } + + Stats RunBenchmark(int n, Slice name, + void (Benchmark::*method)(ThreadState*)) { + SharedState shared; + shared.total = n; + shared.num_initialized = 0; + shared.num_done = 0; + shared.start = false; + if (FLAGS_benchmark_write_rate_limit > 0) { + shared.write_rate_limiter.reset( + NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit)); + } + if (FLAGS_benchmark_read_rate_limit > 0) { + shared.read_rate_limiter.reset(NewGenericRateLimiter( + FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kReadsOnly)); + } + + std::unique_ptr<ReporterAgent> reporter_agent; + if (FLAGS_report_interval_seconds > 0) { + reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file, + FLAGS_report_interval_seconds)); + } + + ThreadArg* arg = new ThreadArg[n]; + + for (int i = 0; i < n; i++) { +#ifdef NUMA + if (FLAGS_enable_numa) { + // Performs a local allocation of memory to threads in numa node. + int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA. + numa_exit_on_error = 1; + int numa_node = i % n_nodes; + bitmask* nodes = numa_allocate_nodemask(); + numa_bitmask_clearall(nodes); + numa_bitmask_setbit(nodes, numa_node); + // numa_bind() call binds the process to the node and these + // properties are passed on to the thread that is created in + // StartThread method called later in the loop. 
+ numa_bind(nodes); + numa_set_strict(1); + numa_free_nodemask(nodes); + } +#endif + arg[i].bm = this; + arg[i].method = method; + arg[i].shared = &shared; + total_thread_count_++; + arg[i].thread = new ThreadState(i, total_thread_count_); + arg[i].thread->stats.SetReporterAgent(reporter_agent.get()); + arg[i].thread->shared = &shared; + FLAGS_env->StartThread(ThreadBody, &arg[i]); + } + + shared.mu.Lock(); + while (shared.num_initialized < n) { + shared.cv.Wait(); + } + + shared.start = true; + shared.cv.SignalAll(); + while (shared.num_done < n) { + shared.cv.Wait(); + } + shared.mu.Unlock(); + + // Stats for some threads can be excluded. + Stats merge_stats; + for (int i = 0; i < n; i++) { + merge_stats.Merge(arg[i].thread->stats); + } + merge_stats.Report(name); + + for (int i = 0; i < n; i++) { + delete arg[i].thread; + } + delete[] arg; + + return merge_stats; + } + + template <OperationType kOpType, typename FnType, typename... Args> + static inline void ChecksumBenchmark(FnType fn, ThreadState* thread, + Args... args) { + const int size = FLAGS_block_size; // use --block_size option for db_bench + std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)"; + const char* label = labels.c_str(); + + std::string data(size, 'x'); + uint64_t bytes = 0; + uint32_t val = 0; + while (bytes < 5000U * uint64_t{1048576}) { // ~5GB + val += static_cast<uint32_t>(fn(data.data(), size, args...)); + thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType); + bytes += size; + } + // Print so result is not dead + fprintf(stderr, "... 
val=0x%x\r", static_cast<unsigned int>(val)); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(label); + } + + void Crc32c(ThreadState* thread) { + ChecksumBenchmark<kCrc>(crc32c::Value, thread); + } + + void xxHash(ThreadState* thread) { + ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0); + } + + void xxHash64(ThreadState* thread) { + ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0); + } + + void xxh3(ThreadState* thread) { + ChecksumBenchmark<kHash>(XXH3_64bits, thread); + } + + void AcquireLoad(ThreadState* thread) { + int dummy; + std::atomic<void*> ap(&dummy); + int count = 0; + void* ptr = nullptr; + thread->stats.AddMessage("(each op is 1000 loads)"); + while (count < 100000) { + for (int i = 0; i < 1000; i++) { + ptr = ap.load(std::memory_order_acquire); + } + count++; + thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers); + } + if (ptr == nullptr) exit(1); // Disable unused variable warning. + } + + void Compress(ThreadState* thread) { + RandomGenerator gen; + Slice input = gen.Generate(FLAGS_block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + CompressionOptions opts; + CompressionContext context(FLAGS_compression_type_e); + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + FLAGS_compression_type_e, + FLAGS_sample_for_compression); + // Compress 1G + while (ok && bytes < int64_t(1) << 30) { + compressed.clear(); + ok = CompressSlice(info, input, &compressed); + produced += compressed.size(); + bytes += input.size(); + thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + char buf[340]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + thread->stats.AddMessage(buf); + thread->stats.AddBytes(bytes); + } + } + + void Uncompress(ThreadState* thread) { + RandomGenerator gen; + Slice input = gen.Generate(FLAGS_block_size); + std::string compressed; 
+ + CompressionContext compression_ctx(FLAGS_compression_type_e); + CompressionOptions compression_opts; + CompressionInfo compression_info( + compression_opts, compression_ctx, CompressionDict::GetEmptyDict(), + FLAGS_compression_type_e, FLAGS_sample_for_compression); + UncompressionContext uncompression_ctx(FLAGS_compression_type_e); + UncompressionInfo uncompression_info(uncompression_ctx, + UncompressionDict::GetEmptyDict(), + FLAGS_compression_type_e); + + bool ok = CompressSlice(compression_info, input, &compressed); + int64_t bytes = 0; + size_t uncompressed_size = 0; + while (ok && bytes < 1024 * 1048576) { + constexpr uint32_t compress_format_version = 2; + + CacheAllocationPtr uncompressed = UncompressData( + uncompression_info, compressed.data(), compressed.size(), + &uncompressed_size, compress_format_version); + + ok = uncompressed.get() != nullptr; + bytes += input.size(); + thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + thread->stats.AddBytes(bytes); + } + } + + // Returns true if the options is initialized from the specified + // options file. 
+ bool InitializeOptionsFromFile(Options* opts) { +#ifndef ROCKSDB_LITE + printf("Initializing RocksDB Options from the specified file\n"); + DBOptions db_opts; + std::vector<ColumnFamilyDescriptor> cf_descs; + if (FLAGS_options_file != "") { + auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts, + &cf_descs); + db_opts.env = FLAGS_env; + if (s.ok()) { + *opts = Options(db_opts, cf_descs[0].options); + return true; + } + fprintf(stderr, "Unable to load options file %s --- %s\n", + FLAGS_options_file.c_str(), s.ToString().c_str()); + exit(1); + } +#else + (void)opts; +#endif + return false; + } + + void InitializeOptionsFromFlags(Options* opts) { + printf("Initializing RocksDB Options from command-line flags\n"); + Options& options = *opts; + ConfigOptions config_options(options); + config_options.ignore_unsupported_options = false; + + assert(db_.db == nullptr); + + options.env = FLAGS_env; + options.wal_dir = FLAGS_wal_dir; + options.dump_malloc_stats = FLAGS_dump_malloc_stats; + options.stats_dump_period_sec = + static_cast<unsigned int>(FLAGS_stats_dump_period_sec); + options.stats_persist_period_sec = + static_cast<unsigned int>(FLAGS_stats_persist_period_sec); + options.persist_stats_to_disk = FLAGS_persist_stats_to_disk; + options.stats_history_buffer_size = + static_cast<size_t>(FLAGS_stats_history_buffer_size); + options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; + + options.compression_opts.level = FLAGS_compression_level; + options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes; + options.compression_opts.zstd_max_train_bytes = + FLAGS_compression_zstd_max_train_bytes; + options.compression_opts.parallel_threads = + FLAGS_compression_parallel_threads; + options.compression_opts.max_dict_buffer_bytes = + FLAGS_compression_max_dict_buffer_bytes; + options.compression_opts.use_zstd_dict_trainer = + FLAGS_compression_use_zstd_dict_trainer; + + options.max_open_files = FLAGS_open_files; + if 
(FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) { + options.write_buffer_manager.reset( + new WriteBufferManager(FLAGS_db_write_buffer_size, cache_)); + } + options.arena_block_size = FLAGS_arena_block_size; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_write_buffer_number_to_maintain = + FLAGS_max_write_buffer_number_to_maintain; + options.max_write_buffer_size_to_maintain = + FLAGS_max_write_buffer_size_to_maintain; + options.max_background_jobs = FLAGS_max_background_jobs; + options.max_background_compactions = FLAGS_max_background_compactions; + options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions); + options.max_background_flushes = FLAGS_max_background_flushes; + options.compaction_style = FLAGS_compaction_style_e; + options.compaction_pri = FLAGS_compaction_pri_e; + options.allow_mmap_reads = FLAGS_mmap_read; + options.allow_mmap_writes = FLAGS_mmap_write; + options.use_direct_reads = FLAGS_use_direct_reads; + options.use_direct_io_for_flush_and_compaction = + FLAGS_use_direct_io_for_flush_and_compaction; + options.manual_wal_flush = FLAGS_manual_wal_flush; + options.wal_compression = FLAGS_wal_compression_e; +#ifndef ROCKSDB_LITE + options.ttl = FLAGS_fifo_compaction_ttl; + options.compaction_options_fifo = CompactionOptionsFIFO( + FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024, + FLAGS_fifo_compaction_allow_compaction); + options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm; +#endif // ROCKSDB_LITE + options.prefix_extractor = prefix_extractor_; + if (FLAGS_use_uint64_comparator) { + options.comparator = test::Uint64Comparator(); + if (FLAGS_key_size != 8) { + fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n"); + exit(1); + } + } + if (FLAGS_use_stderr_info_logger) { + options.info_log.reset(new 
StderrLogger()); + } + options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0; + options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio; + options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering; + if (FLAGS_memtable_insert_with_hint_prefix_size > 0) { + options.memtable_insert_with_hint_prefix_extractor.reset( + NewCappedPrefixTransform( + FLAGS_memtable_insert_with_hint_prefix_size)); + } + options.bloom_locality = FLAGS_bloom_locality; + options.max_file_opening_threads = FLAGS_file_opening_threads; + options.compaction_readahead_size = FLAGS_compaction_readahead_size; + options.log_readahead_size = FLAGS_log_readahead_size; + options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size; + options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size; + options.use_fsync = FLAGS_use_fsync; + options.num_levels = FLAGS_num_levels; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.level_compaction_dynamic_level_bytes = + FLAGS_level_compaction_dynamic_level_bytes; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + Status s = + CreateMemTableRepFactory(config_options, &options.memtable_factory); + if (!s.ok()) { + fprintf(stderr, "Could not create memtable factory: %s\n", + s.ToString().c_str()); + exit(1); + } else if ((FLAGS_prefix_size == 0) && + (options.memtable_factory->IsInstanceOf("prefix_hash") || + options.memtable_factory->IsInstanceOf("hash_linkedlist"))) { + fprintf(stderr, + "prefix_size should be non-zero if PrefixHash or " + "HashLinkedList memtablerep is used\n"); + exit(1); + } + if (FLAGS_use_plain_table) { +#ifndef ROCKSDB_LITE + if (!options.memtable_factory->IsInstanceOf("prefix_hash") && + !options.memtable_factory->IsInstanceOf("hash_linkedlist")) { + 
fprintf(stderr, "Warning: plain table is used with %s\n", + options.memtable_factory->Name()); + } + + int bloom_bits_per_key = FLAGS_bloom_bits; + if (bloom_bits_per_key < 0) { + bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key; + } + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = FLAGS_key_size; + plain_table_options.bloom_bits_per_key = bloom_bits_per_key; + plain_table_options.hash_table_ratio = 0.75; + options.table_factory = std::shared_ptr<TableFactory>( + NewPlainTableFactory(plain_table_options)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE + } else if (FLAGS_use_cuckoo_table) { +#ifndef ROCKSDB_LITE + if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) { + fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); + exit(1); + } + + if (!FLAGS_mmap_read) { + fprintf(stderr, "cuckoo table format requires mmap read to operate\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::CuckooTableOptions table_options; + table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; + table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; + options.table_factory = + std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE + } else { + BlockBasedTableOptions block_based_options; + block_based_options.checksum = + static_cast<ChecksumType>(FLAGS_checksum_type); + if (FLAGS_use_hash_search) { + if (FLAGS_prefix_size == 0) { + fprintf(stderr, + "prefix_size not assigned when enable use_hash_search \n"); + exit(1); + } + block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + } else { + block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; + } + if (FLAGS_partition_index_and_filters || FLAGS_partition_index) { + if (FLAGS_index_with_first_key) { + fprintf(stderr, + "--index_with_first_key is not compatible with" + 
" partition index."); + } + if (FLAGS_use_hash_search) { + fprintf(stderr, + "use_hash_search is incompatible with " + "partition index and is ignored"); + } + block_based_options.index_type = + BlockBasedTableOptions::kTwoLevelIndexSearch; + block_based_options.metadata_block_size = FLAGS_metadata_block_size; + if (FLAGS_partition_index_and_filters) { + block_based_options.partition_filters = true; + } + } else if (FLAGS_index_with_first_key) { + block_based_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + } + BlockBasedTableOptions::IndexShorteningMode index_shortening = + block_based_options.index_shortening; + switch (FLAGS_index_shortening_mode) { + case 0: + index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + break; + case 1: + index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators; + break; + case 2: + index_shortening = BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor; + break; + default: + fprintf(stderr, "Unknown key shortening mode\n"); + } + block_based_options.optimize_filters_for_memory = + FLAGS_optimize_filters_for_memory; + block_based_options.index_shortening = index_shortening; + if (cache_ == nullptr) { + block_based_options.no_block_cache = true; + } + block_based_options.cache_index_and_filter_blocks = + FLAGS_cache_index_and_filter_blocks; + block_based_options.pin_l0_filter_and_index_blocks_in_cache = + FLAGS_pin_l0_filter_and_index_blocks_in_cache; + block_based_options.pin_top_level_index_and_filter = + FLAGS_pin_top_level_index_and_filter; + if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps + block_based_options.cache_index_and_filter_blocks_with_high_priority = + true; + } + if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio > + 1.0) { + fprintf(stderr, + "Sum of high_pri_pool_ratio and low_pri_pool_ratio " + "cannot exceed 1.0.\n"); + } + block_based_options.block_cache = cache_; + 
block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kCompressionDictionaryBuildingBuffer, + {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFilterConstruction, + {/*.charged = */ FLAGS_charge_filter_construction + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kBlockBasedTableReader, + {/*.charged = */ FLAGS_charge_table_reader + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFileMetadata, + {/*.charged = */ FLAGS_charge_file_metadata + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kBlobCache, + {/*.charged = */ FLAGS_charge_blob_cache + ? 
CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.block_cache_compressed = compressed_cache_; + block_based_options.block_size = FLAGS_block_size; + block_based_options.block_restart_interval = FLAGS_block_restart_interval; + block_based_options.index_block_restart_interval = + FLAGS_index_block_restart_interval; + block_based_options.format_version = + static_cast<uint32_t>(FLAGS_format_version); + block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit; + block_based_options.enable_index_compression = + FLAGS_enable_index_compression; + block_based_options.block_align = FLAGS_block_align; + block_based_options.whole_key_filtering = FLAGS_whole_key_filtering; + block_based_options.max_auto_readahead_size = + FLAGS_max_auto_readahead_size; + block_based_options.initial_auto_readahead_size = + FLAGS_initial_auto_readahead_size; + block_based_options.num_file_reads_for_auto_readahead = + FLAGS_num_file_reads_for_auto_readahead; + BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache = + block_based_options.prepopulate_block_cache; + switch (FLAGS_prepopulate_block_cache) { + case 0: + prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kDisable; + break; + case 1: + prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + break; + default: + fprintf(stderr, "Unknown prepopulate block cache mode\n"); + } + block_based_options.prepopulate_block_cache = prepopulate_block_cache; + if (FLAGS_use_data_block_hash_index) { + block_based_options.data_block_index_type = + ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + block_based_options.data_block_index_type = + ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch; + } + block_based_options.data_block_hash_table_util_ratio = + FLAGS_data_block_hash_table_util_ratio; + if (FLAGS_read_cache_path != "") { +#ifndef ROCKSDB_LITE + 
Status rc_status; + + // Read cache need to be provided with a the Logger, we will put all + // reac cache logs in the read cache path in a file named rc_LOG + rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path); + std::shared_ptr<Logger> read_cache_logger; + if (rc_status.ok()) { + rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG", + &read_cache_logger); + } + + if (rc_status.ok()) { + PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path, + FLAGS_read_cache_size, + read_cache_logger); + + rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read; + rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write; + rc_cfg.writer_qdepth = 4; + rc_cfg.writer_dispatch_size = 4 * 1024; + + auto pcache = std::make_shared<BlockCacheTier>(rc_cfg); + block_based_options.persistent_cache = pcache; + rc_status = pcache->Open(); + } + + if (!rc_status.ok()) { + fprintf(stderr, "Error initializing read cache, %s\n", + rc_status.ToString().c_str()); + exit(1); + } +#else + fprintf(stderr, "Read cache is not supported in LITE\n"); + exit(1); + +#endif + } + + if (FLAGS_use_blob_cache) { + if (FLAGS_use_shared_block_and_blob_cache) { + options.blob_cache = cache_; + } else { + if (FLAGS_blob_cache_size > 0) { + LRUCacheOptions co; + co.capacity = FLAGS_blob_cache_size; + co.num_shard_bits = FLAGS_blob_cache_numshardbits; + co.memory_allocator = GetCacheAllocator(); + + options.blob_cache = NewLRUCache(co); + } else { + fprintf( + stderr, + "Unable to create a standalone blob cache if blob_cache_size " + "<= 0.\n"); + exit(1); + } + } + switch (FLAGS_prepopulate_blob_cache) { + case 0: + options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable; + break; + case 1: + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + break; + default: + fprintf(stderr, "Unknown prepopulate blob cache mode\n"); + exit(1); + } + + fprintf(stdout, + "Integrated BlobDB: blob cache enabled" + ", block and blob caches shared: %d", + 
FLAGS_use_shared_block_and_blob_cache); + if (!FLAGS_use_shared_block_and_blob_cache) { + fprintf(stdout, + ", blob cache size %" PRIu64 + ", blob cache num shard bits: %d", + FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits); + } + fprintf(stdout, ", blob cache prepopulated: %d\n", + FLAGS_prepopulate_blob_cache); + } else { + fprintf(stdout, "Integrated BlobDB: blob cache disabled\n"); + } + + options.table_factory.reset( + NewBlockBasedTableFactory(block_based_options)); + } + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != + static_cast<unsigned int>(FLAGS_num_levels)) { + fprintf(stderr, "Insufficient number of fanouts specified %d\n", + static_cast<int>( + FLAGS_max_bytes_for_level_multiplier_additional_v.size())); + exit(1); + } + options.max_bytes_for_level_multiplier_additional = + FLAGS_max_bytes_for_level_multiplier_additional_v; + } + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options.compression = FLAGS_compression_type_e; + if (FLAGS_simulate_hybrid_fs_file != "") { + options.bottommost_temperature = Temperature::kWarm; + } + options.preclude_last_level_data_seconds = + FLAGS_preclude_last_level_data_seconds; + options.preserve_internal_time_seconds = + FLAGS_preserve_internal_time_seconds; + options.sample_for_compression = FLAGS_sample_for_compression; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + options.max_total_wal_size = FLAGS_max_total_wal_size; + + if (FLAGS_min_level_to_compress >= 0) { + assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); + options.compression_per_level.resize(FLAGS_num_levels); + for (int i = 0; i < FLAGS_min_level_to_compress; i++) { + options.compression_per_level[i] 
= kNoCompression; + } + for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) { + options.compression_per_level[i] = FLAGS_compression_type_e; + } + } + options.soft_pending_compaction_bytes_limit = + FLAGS_soft_pending_compaction_bytes_limit; + options.hard_pending_compaction_bytes_limit = + FLAGS_hard_pending_compaction_bytes_limit; + options.delayed_write_rate = FLAGS_delayed_write_rate; + options.allow_concurrent_memtable_write = + FLAGS_allow_concurrent_memtable_write; + options.experimental_mempurge_threshold = + FLAGS_experimental_mempurge_threshold; + options.inplace_update_support = FLAGS_inplace_update_support; + options.inplace_update_num_locks = FLAGS_inplace_update_num_locks; + options.enable_write_thread_adaptive_yield = + FLAGS_enable_write_thread_adaptive_yield; + options.enable_pipelined_write = FLAGS_enable_pipelined_write; + options.unordered_write = FLAGS_unordered_write; + options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec; + options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec; + options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; + options.max_compaction_bytes = FLAGS_max_compaction_bytes; + options.disable_auto_compactions = FLAGS_disable_auto_compactions; + options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits; + options.paranoid_checks = FLAGS_paranoid_checks; + options.force_consistency_checks = FLAGS_force_consistency_checks; + options.check_flush_compaction_key_order = + FLAGS_check_flush_compaction_key_order; + options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds; + options.ttl = FLAGS_ttl_seconds; + // fill storage options + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; + options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; + options.bytes_per_sync = FLAGS_bytes_per_sync; + options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync; + + // merge 
operator options + if (!FLAGS_merge_operator.empty()) { + s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator, + &options.merge_operator); + if (!s.ok()) { + fprintf(stderr, "invalid merge operator[%s]: %s\n", + FLAGS_merge_operator.c_str(), s.ToString().c_str()); + exit(1); + } + } + options.max_successive_merges = FLAGS_max_successive_merges; + options.report_bg_io_stats = FLAGS_report_bg_io_stats; + + // set universal style compaction configurations, if applicable + if (FLAGS_universal_size_ratio != 0) { + options.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; + } + if (FLAGS_universal_min_merge_width != 0) { + options.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; + } + if (FLAGS_universal_max_merge_width != 0) { + options.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; + } + if (FLAGS_universal_max_size_amplification_percent != 0) { + options.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; + } + if (FLAGS_universal_compression_size_percent != -1) { + options.compaction_options_universal.compression_size_percent = + FLAGS_universal_compression_size_percent; + } + options.compaction_options_universal.allow_trivial_move = + FLAGS_universal_allow_trivial_move; + options.compaction_options_universal.incremental = + FLAGS_universal_incremental; + if (FLAGS_thread_status_per_interval > 0) { + options.enable_thread_tracking = true; + } + + if (FLAGS_user_timestamp_size > 0) { + if (FLAGS_user_timestamp_size != 8) { + fprintf(stderr, "Only 64 bits timestamps are supported.\n"); + exit(1); + } + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + + options.allow_data_in_errors = FLAGS_allow_data_in_errors; + options.track_and_verify_wals_in_manifest = + FLAGS_track_and_verify_wals_in_manifest; + + // Integrated BlobDB + options.enable_blob_files = FLAGS_enable_blob_files; + 
options.min_blob_size = FLAGS_min_blob_size; + options.blob_file_size = FLAGS_blob_file_size; + options.blob_compression_type = + StringToCompressionType(FLAGS_blob_compression_type.c_str()); + options.enable_blob_garbage_collection = + FLAGS_enable_blob_garbage_collection; + options.blob_garbage_collection_age_cutoff = + FLAGS_blob_garbage_collection_age_cutoff; + options.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; + options.blob_compaction_readahead_size = + FLAGS_blob_compaction_readahead_size; + options.blob_file_starting_level = FLAGS_blob_file_starting_level; + +#ifndef ROCKSDB_LITE + if (FLAGS_readonly && FLAGS_transaction_db) { + fprintf(stderr, "Cannot use readonly flag with transaction_db\n"); + exit(1); + } + if (FLAGS_use_secondary_db && + (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) { + fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n"); + exit(1); + } +#endif // ROCKSDB_LITE + options.memtable_protection_bytes_per_key = + FLAGS_memtable_protection_bytes_per_key; + } + + void InitializeOptionsGeneral(Options* opts) { + // Be careful about what is set here to avoid accidentally overwriting + // settings already configured by OPTIONS file. Only configure settings that + // are needed for the benchmark to run, settings for shared objects that + // were not configured already, settings that require dynamically invoking + // APIs, and settings for the benchmark itself. + Options& options = *opts; + + // Always set these since they are harmless when not needed and prevent + // a guaranteed failure when they are needed. 
+ options.create_missing_column_families = true; + options.create_if_missing = true; + + if (options.statistics == nullptr) { + options.statistics = dbstats; + } + + auto table_options = + options.table_factory->GetOptions<BlockBasedTableOptions>(); + if (table_options != nullptr) { + if (FLAGS_cache_size > 0) { + // This violates this function's rules on when to set options. But we + // have to do it because the case of unconfigured block cache in OPTIONS + // file is indistinguishable (it is sanitized to 8MB by this point, not + // nullptr), and our regression tests assume this will be the shared + // block cache, even with OPTIONS file provided. + table_options->block_cache = cache_; + } + if (table_options->filter_policy == nullptr) { + if (FLAGS_bloom_bits < 0) { + table_options->filter_policy = BlockBasedTableOptions().filter_policy; + } else if (FLAGS_bloom_bits == 0) { + table_options->filter_policy.reset(); + } else { + table_options->filter_policy.reset( + FLAGS_use_ribbon_filter ? 
NewRibbonFilterPolicy(FLAGS_bloom_bits) + : NewBloomFilterPolicy(FLAGS_bloom_bits)); + } + } + } + + if (options.row_cache == nullptr) { + if (FLAGS_row_cache_size) { + if (FLAGS_cache_numshardbits >= 1) { + options.row_cache = + NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits); + } else { + options.row_cache = NewLRUCache(FLAGS_row_cache_size); + } + } + } + + if (options.env == Env::Default()) { + options.env = FLAGS_env; + } + if (FLAGS_enable_io_prio) { + options.env->LowerThreadPoolIOPriority(Env::LOW); + options.env->LowerThreadPoolIOPriority(Env::HIGH); + } + if (FLAGS_enable_cpu_prio) { + options.env->LowerThreadPoolCPUPriority(Env::LOW); + options.env->LowerThreadPoolCPUPriority(Env::HIGH); + } + + if (FLAGS_sine_write_rate) { + FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0)); + } + + if (options.rate_limiter == nullptr) { + if (FLAGS_rate_limiter_bytes_per_sec > 0) { + options.rate_limiter.reset(NewGenericRateLimiter( + FLAGS_rate_limiter_bytes_per_sec, + FLAGS_rate_limiter_refill_period_us, 10 /* fairness */, + // TODO: replace this with a more general FLAG for deciding + // RateLimiter::Mode as now we also rate-limit foreground reads e.g, + // Get()/MultiGet() + FLAGS_rate_limit_bg_reads ? 
RateLimiter::Mode::kReadsOnly + : RateLimiter::Mode::kWritesOnly, + FLAGS_rate_limiter_auto_tuned)); + } + } + + options.listeners.emplace_back(listener_); + + if (options.file_checksum_gen_factory == nullptr) { + if (FLAGS_file_checksum) { + options.file_checksum_gen_factory.reset( + new FileChecksumGenCrc32cFactory()); + } + } + + if (FLAGS_num_multi_db <= 1) { + OpenDb(options, FLAGS_db, &db_); + } else { + multi_dbs_.clear(); + multi_dbs_.resize(FLAGS_num_multi_db); + auto wal_dir = options.wal_dir; + for (int i = 0; i < FLAGS_num_multi_db; i++) { + if (!wal_dir.empty()) { + options.wal_dir = GetPathForMultiple(wal_dir, i); + } + OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]); + } + options.wal_dir = wal_dir; + } + + // KeepFilter is a noop filter, this can be used to test compaction filter + if (options.compaction_filter == nullptr) { + if (FLAGS_use_keep_filter) { + options.compaction_filter = new KeepFilter(); + fprintf(stdout, "A noop compaction filter is used\n"); + } + } + + if (FLAGS_use_existing_keys) { + // Only work on single database + assert(db_.db != nullptr); + ReadOptions read_opts; // before read_options_ initialized + read_opts.total_order_seek = true; + Iterator* iter = db_.db->NewIterator(read_opts); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_.emplace_back(iter->key().ToString()); + } + delete iter; + FLAGS_num = keys_.size(); + } + } + + void Open(Options* opts) { + if (!InitializeOptionsFromFile(opts)) { + InitializeOptionsFromFlags(opts); + } + + InitializeOptionsGeneral(opts); + } + + void OpenDb(Options options, const std::string& db_name, + DBWithColumnFamilies* db) { + uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0; + Status s; + // Open with column families if necessary. 
+ if (FLAGS_num_column_families > 1) { + size_t num_hot = FLAGS_num_column_families; + if (FLAGS_num_hot_column_families > 0 && + FLAGS_num_hot_column_families < FLAGS_num_column_families) { + num_hot = FLAGS_num_hot_column_families; + } else { + FLAGS_num_hot_column_families = FLAGS_num_column_families; + } + std::vector<ColumnFamilyDescriptor> column_families; + for (size_t i = 0; i < num_hot; i++) { + column_families.push_back(ColumnFamilyDescriptor( + ColumnFamilyName(i), ColumnFamilyOptions(options))); + } + std::vector<int> cfh_idx_to_prob; + if (!FLAGS_column_family_distribution.empty()) { + std::stringstream cf_prob_stream(FLAGS_column_family_distribution); + std::string cf_prob; + int sum = 0; + while (std::getline(cf_prob_stream, cf_prob, ',')) { + cfh_idx_to_prob.push_back(std::stoi(cf_prob)); + sum += cfh_idx_to_prob.back(); + } + if (sum != 100) { + fprintf(stderr, "column_family_distribution items must sum to 100\n"); + exit(1); + } + if (cfh_idx_to_prob.size() != num_hot) { + fprintf(stderr, + "got %" ROCKSDB_PRIszt + " column_family_distribution items; expected " + "%" ROCKSDB_PRIszt "\n", + cfh_idx_to_prob.size(), num_hot); + exit(1); + } + } +#ifndef ROCKSDB_LITE + if (FLAGS_readonly) { + s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh, + &db->db); + } else if (FLAGS_optimistic_transaction_db) { + s = OptimisticTransactionDB::Open(options, db_name, column_families, + &db->cfh, &db->opt_txn_db); + if (s.ok()) { + db->db = db->opt_txn_db->GetBaseDB(); + } + } else if (FLAGS_transaction_db) { + TransactionDB* ptr; + TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } + s = TransactionDB::Open(options, txn_db_options, db_name, + column_families, &db->cfh, &ptr); + if (s.ok()) { + db->db = ptr; + } + } else { + s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); + 
} +#else + s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); +#endif // ROCKSDB_LITE + db->cfh.resize(FLAGS_num_column_families); + db->num_created = num_hot; + db->num_hot = num_hot; + db->cfh_idx_to_prob = std::move(cfh_idx_to_prob); +#ifndef ROCKSDB_LITE + } else if (FLAGS_readonly) { + s = DB::OpenForReadOnly(options, db_name, &db->db); + } else if (FLAGS_optimistic_transaction_db) { + s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db); + if (s.ok()) { + db->db = db->opt_txn_db->GetBaseDB(); + } + } else if (FLAGS_transaction_db) { + TransactionDB* ptr = nullptr; + TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } + s = CreateLoggerFromOptions(db_name, options, &options.info_log); + if (s.ok()) { + s = TransactionDB::Open(options, txn_db_options, db_name, &ptr); + } + if (s.ok()) { + db->db = ptr; + } + } else if (FLAGS_use_blob_db) { + // Stacked BlobDB + blob_db::BlobDBOptions blob_db_options; + blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; + blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; + blob_db_options.is_fifo = FLAGS_blob_db_is_fifo; + blob_db_options.max_db_size = FLAGS_blob_db_max_db_size; + blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs; + blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; + blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; + blob_db_options.blob_file_size = FLAGS_blob_db_file_size; + blob_db_options.compression = FLAGS_blob_db_compression_type_e; + blob_db::BlobDB* ptr = nullptr; + s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr); + if (s.ok()) { + db->db = ptr; + } + } else if (FLAGS_use_secondary_db) { + if (FLAGS_secondary_path.empty()) { + std::string default_secondary_path; + FLAGS_env->GetTestDirectory(&default_secondary_path); + 
default_secondary_path += "/dbbench_secondary"; + FLAGS_secondary_path = default_secondary_path; + } + s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db); + if (s.ok() && FLAGS_secondary_update_interval > 0) { + secondary_update_thread_.reset(new port::Thread( + [this](int interval, DBWithColumnFamilies* _db) { + while (0 == secondary_update_stopped_.load( + std::memory_order_relaxed)) { + Status secondary_update_status = + _db->db->TryCatchUpWithPrimary(); + if (!secondary_update_status.ok()) { + fprintf(stderr, "Failed to catch up with primary: %s\n", + secondary_update_status.ToString().c_str()); + break; + } + ++secondary_db_updates_; + FLAGS_env->SleepForMicroseconds(interval * 1000000); + } + }, + FLAGS_secondary_update_interval, db)); + } +#endif // ROCKSDB_LITE + } else { + s = DB::Open(options, db_name, &db->db); + } + if (FLAGS_report_open_timing) { + std::cout << "OpenDb: " + << (FLAGS_env->NowNanos() - open_start) / 1000000.0 + << " milliseconds\n"; + } + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM }; + + void WriteSeqDeterministic(ThreadState* thread) { + DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL); + } + + void WriteUniqueRandomDeterministic(ThreadState* thread) { + DoDeterministicCompact(thread, open_options_.compaction_style, + UNIQUE_RANDOM); + } + + void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); } + + void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); } + + void WriteUniqueRandom(ThreadState* thread) { + DoWrite(thread, UNIQUE_RANDOM); + } + + class KeyGenerator { + public: + KeyGenerator(Random64* rand, WriteMode mode, uint64_t num, + uint64_t /*num_per_set*/ = 64 * 1024) + : rand_(rand), mode_(mode), num_(num), next_(0) { + if (mode_ == UNIQUE_RANDOM) { + // NOTE: if memory consumption of this approach becomes a concern, + // we can either break it into 
pieces and only random shuffle a section + // each time. Alternatively, use a bit map implementation + // (https://reviews.facebook.net/differential/diff/54627/) + values_.resize(num_); + for (uint64_t i = 0; i < num_; ++i) { + values_[i] = i; + } + RandomShuffle(values_.begin(), values_.end(), + static_cast<uint32_t>(seed_base)); + } + } + + uint64_t Next() { + switch (mode_) { + case SEQUENTIAL: + return next_++; + case RANDOM: + return rand_->Next() % num_; + case UNIQUE_RANDOM: + assert(next_ < num_); + return values_[next_++]; + } + assert(false); + return std::numeric_limits<uint64_t>::max(); + } + + // Only available for UNIQUE_RANDOM mode. + uint64_t Fetch(uint64_t index) { + assert(mode_ == UNIQUE_RANDOM); + assert(index < values_.size()); + return values_[index]; + } + + private: + Random64* rand_; + WriteMode mode_; + const uint64_t num_; + uint64_t next_; + std::vector<uint64_t> values_; + }; + + DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; } + + DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) { + return SelectDBWithCfh(thread->rand.Next()); + } + + DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) { + if (db_.db != nullptr) { + return &db_; + } else { + return &multi_dbs_[rand_int % multi_dbs_.size()]; + } + } + + double SineRate(double x) { + return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d; + } + + void DoWrite(ThreadState* thread, WriteMode write_mode) { + const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; + const int64_t num_ops = writes_ == 0 ? 
num_ : writes_; + + size_t num_key_gens = 1; + if (db_.db == nullptr) { + num_key_gens = multi_dbs_.size(); + } + std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens); + int64_t max_ops = num_ops * num_key_gens; + int64_t ops_per_stage = max_ops; + if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) { + ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families / + FLAGS_num_hot_column_families) + + 1; + } + + Duration duration(test_duration, max_ops, ops_per_stage); + const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_; + for (size_t i = 0; i < num_key_gens; i++) { + key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, + num_per_key_gen, ops_per_stage)); + } + + if (num_ != FLAGS_num) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_); + thread->stats.AddMessage(msg); + } + + RandomGenerator gen; + WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0, + FLAGS_write_batch_protection_bytes_per_key, + user_timestamp_size_); + Status s; + int64_t bytes = 0; + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + std::unique_ptr<const char[]> begin_key_guard; + Slice begin_key = AllocateKey(&begin_key_guard); + std::unique_ptr<const char[]> end_key_guard; + Slice end_key = AllocateKey(&end_key_guard); + double p = 0.0; + uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0; + // If user set overwrite_probability flag, + // check if value is in [0.0,1.0]. + if (FLAGS_overwrite_probability > 0.0) { + p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability; + // If overwrite set by user, and UNIQUE_RANDOM mode on, + // the overwrite_window_size must be > 0. + if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) { + fprintf(stderr, + "Overwrite_window_size must be strictly greater than 0.\n"); + ErrorExit(); + } + } + + // Default_random_engine provides slightly + // improved throughput over mt19937. 
+ std::default_random_engine overwrite_gen{ + static_cast<unsigned int>(seed_base)}; + std::bernoulli_distribution overwrite_decider(p); + + // Inserted key window is filled with the last N + // keys previously inserted into the DB (with + // N=FLAGS_overwrite_window_size). + // We use a deque struct because: + // - random access is O(1) + // - insertion/removal at beginning/end is also O(1). + std::deque<int64_t> inserted_key_window; + Random64 reservoir_id_gen(seed_base); + + // --- Variables used in disposable/persistent keys simulation: + // The following variables are used when + // disposable_entries_batch_size is >0. We simualte a workload + // where the following sequence is repeated multiple times: + // "A set of keys S1 is inserted ('disposable entries'), then after + // some delay another set of keys S2 is inserted ('persistent entries') + // and the first set of keys S1 is deleted. S2 artificially represents + // the insertion of hypothetical results from some undefined computation + // done on the first set of keys S1. 
The next sequence can start as soon + // as the last disposable entry in the set S1 of this sequence is + // inserted, if the delay is non negligible" + bool skip_for_loop = false, is_disposable_entry = true; + std::vector<uint64_t> disposable_entries_index(num_key_gens, 0); + std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0); + const uint64_t kNumDispAndPersEntries = + FLAGS_disposable_entries_batch_size + + FLAGS_persistent_entries_batch_size; + if (kNumDispAndPersEntries > 0) { + if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) || + (p > 0.0)) { + fprintf( + stderr, + "Disposable/persistent deletes are not compatible with overwrites " + "and DeleteRanges; and are only supported in filluniquerandom.\n"); + ErrorExit(); + } + if (FLAGS_disposable_entries_value_size < 0 || + FLAGS_persistent_entries_value_size < 0) { + fprintf( + stderr, + "disposable_entries_value_size and persistent_entries_value_size" + "have to be positive.\n"); + ErrorExit(); + } + } + Random rnd_disposable_entry(static_cast<uint32_t>(seed_base)); + std::string random_value; + // Queue that stores scheduled timestamp of disposable entries deletes, + // along with starting index of disposable entry keys to delete. + std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q( + num_key_gens); + // --- End of variables used in disposable/persistent keys simulation. 
+ + std::vector<std::unique_ptr<const char[]>> expanded_key_guards; + std::vector<Slice> expanded_keys; + if (FLAGS_expand_range_tombstones) { + expanded_key_guards.resize(range_tombstone_width_); + for (auto& expanded_key_guard : expanded_key_guards) { + expanded_keys.emplace_back(AllocateKey(&expanded_key_guard)); + } + } + + std::unique_ptr<char[]> ts_guard; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + + int64_t stage = 0; + int64_t num_written = 0; + int64_t next_seq_db_at = num_ops; + size_t id = 0; + int64_t num_range_deletions = 0; + + while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) { + if (duration.GetStage() != stage) { + stage = duration.GetStage(); + if (db_.db != nullptr) { + db_.CreateNewCf(open_options_, stage); + } else { + for (auto& db : multi_dbs_) { + db.CreateNewCf(open_options_, stage); + } + } + } + + if (write_mode != SEQUENTIAL) { + id = thread->rand.Next() % num_key_gens; + } else { + // When doing a sequential load with multiple databases, load them in + // order rather than all at the same time to avoid: + // 1) long delays between flushing memtables + // 2) flushing memtables for all of them at the same point in time + // 3) not putting the same number of keys in each database + if (num_written >= next_seq_db_at) { + next_seq_db_at += num_ops; + id++; + if (id >= num_key_gens) { + fprintf(stderr, "Logic error. 
Filled all databases\n"); + ErrorExit(); + } + } + } + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id); + + batch.Clear(); + int64_t batch_bytes = 0; + + for (int64_t j = 0; j < entries_per_batch_; j++) { + int64_t rand_num = 0; + if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) { + if ((inserted_key_window.size() > 0) && + overwrite_decider(overwrite_gen)) { + num_overwrites++; + rand_num = inserted_key_window[reservoir_id_gen.Next() % + inserted_key_window.size()]; + } else { + num_unique_keys++; + rand_num = key_gens[id]->Next(); + if (inserted_key_window.size() < FLAGS_overwrite_window_size) { + inserted_key_window.push_back(rand_num); + } else { + inserted_key_window.pop_front(); + inserted_key_window.push_back(rand_num); + } + } + } else if (kNumDispAndPersEntries > 0) { + // Check if queue is non-empty and if we need to insert + // 'persistent' KV entries (KV entries that are never deleted) + // and delete disposable entries previously inserted. + if (!disposable_entries_q[id].empty() && + (disposable_entries_q[id].front().first < + FLAGS_env->NowMicros())) { + // If we need to perform a "merge op" pattern, + // we first write all the persistent KV entries not targeted + // by deletes, and then we write the disposable entries deletes. + if (persistent_ent_and_del_index[id] < + FLAGS_persistent_entries_batch_size) { + // Generate key to insert. + rand_num = + key_gens[id]->Fetch(disposable_entries_q[id].front().second + + FLAGS_disposable_entries_batch_size + + persistent_ent_and_del_index[id]); + persistent_ent_and_del_index[id]++; + is_disposable_entry = false; + skip_for_loop = false; + } else if (persistent_ent_and_del_index[id] < + kNumDispAndPersEntries) { + // Find key of the entry to delete. 
+ rand_num = + key_gens[id]->Fetch(disposable_entries_q[id].front().second + + (persistent_ent_and_del_index[id] - + FLAGS_persistent_entries_batch_size)); + persistent_ent_and_del_index[id]++; + GenerateKeyFromInt(rand_num, FLAGS_num, &key); + // For the delete operation, everything happens here and we + // skip the rest of the for-loop, which is designed for + // inserts. + if (FLAGS_num_column_families <= 1) { + batch.Delete(key); + } else { + // We use same rand_num as seed for key and column family so + // that we can deterministically find the cfh corresponding to a + // particular key while reading the key. + batch.Delete(db_with_cfh->GetCfh(rand_num), key); + } + // A delete only includes Key+Timestamp (no value). + batch_bytes += key_size_ + user_timestamp_size_; + bytes += key_size_ + user_timestamp_size_; + num_selective_deletes++; + // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++). + skip_for_loop = true; + } else { + assert(false); // should never reach this point. + } + // If disposable_entries_q needs to be updated (ie: when a selective + // insert+delete was successfully completed, pop the job out of the + // queue). + if (!disposable_entries_q[id].empty() && + (disposable_entries_q[id].front().first < + FLAGS_env->NowMicros()) && + persistent_ent_and_del_index[id] == kNumDispAndPersEntries) { + disposable_entries_q[id].pop(); + persistent_ent_and_del_index[id] = 0; + } + + // If we are deleting disposable entries, skip the rest of the + // for-loop since there is no key-value inserts at this moment in + // time. + if (skip_for_loop) { + continue; + } + + } + // If no job is in the queue, then we keep inserting disposable KV + // entries that will be deleted later by a series of deletes. 
+ else { + rand_num = key_gens[id]->Fetch(disposable_entries_index[id]); + disposable_entries_index[id]++; + is_disposable_entry = true; + if ((disposable_entries_index[id] % + FLAGS_disposable_entries_batch_size) == 0) { + // Skip the persistent KV entries inserts for now + disposable_entries_index[id] += + FLAGS_persistent_entries_batch_size; + } + } + } else { + rand_num = key_gens[id]->Next(); + } + GenerateKeyFromInt(rand_num, FLAGS_num, &key); + Slice val; + if (kNumDispAndPersEntries > 0) { + random_value = rnd_disposable_entry.RandomString( + is_disposable_entry ? FLAGS_disposable_entries_value_size + : FLAGS_persistent_entries_value_size); + val = Slice(random_value); + num_unique_keys++; + } else { + val = gen.Generate(); + } + if (use_blob_db_) { +#ifndef ROCKSDB_LITE + // Stacked BlobDB + blob_db::BlobDB* blobdb = + static_cast<blob_db::BlobDB*>(db_with_cfh->db); + if (FLAGS_blob_db_max_ttl_range > 0) { + int ttl = rand() % FLAGS_blob_db_max_ttl_range; + s = blobdb->PutWithTTL(write_options_, key, val, ttl); + } else { + s = blobdb->Put(write_options_, key, val); + } +#endif // ROCKSDB_LITE + } else if (FLAGS_num_column_families <= 1) { + batch.Put(key, val); + } else { + // We use same rand_num as seed for key and column family so that we + // can deterministically find the cfh corresponding to a particular + // key while reading the key. + batch.Put(db_with_cfh->GetCfh(rand_num), key, val); + } + batch_bytes += val.size() + key_size_ + user_timestamp_size_; + bytes += val.size() + key_size_ + user_timestamp_size_; + ++num_written; + + // If all disposable entries have been inserted, then we need to + // add in the job queue a call for 'persistent entry insertions + + // disposable entry deletions'. 
+ if (kNumDispAndPersEntries > 0 && is_disposable_entry && + ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) { + // Queue contains [timestamp, starting_idx], + // timestamp = current_time + delay (minimum aboslute time when to + // start inserting the selective deletes) starting_idx = index in the + // keygen of the rand_num to generate the key of the first KV entry to + // delete (= key of the first selective delete). + disposable_entries_q[id].push(std::make_pair( + FLAGS_env->NowMicros() + + FLAGS_disposable_entries_delete_delay /* timestamp */, + disposable_entries_index[id] - kNumDispAndPersEntries + /*starting idx*/)); + } + if (writes_per_range_tombstone_ > 0 && + num_written > writes_before_delete_range_ && + (num_written - writes_before_delete_range_) / + writes_per_range_tombstone_ <= + max_num_range_tombstones_ && + (num_written - writes_before_delete_range_) % + writes_per_range_tombstone_ == + 0) { + num_range_deletions++; + int64_t begin_num = key_gens[id]->Next(); + if (FLAGS_expand_range_tombstones) { + for (int64_t offset = 0; offset < range_tombstone_width_; + ++offset) { + GenerateKeyFromInt(begin_num + offset, FLAGS_num, + &expanded_keys[offset]); + if (use_blob_db_) { +#ifndef ROCKSDB_LITE + // Stacked BlobDB + s = db_with_cfh->db->Delete(write_options_, + expanded_keys[offset]); +#endif // ROCKSDB_LITE + } else if (FLAGS_num_column_families <= 1) { + batch.Delete(expanded_keys[offset]); + } else { + batch.Delete(db_with_cfh->GetCfh(rand_num), + expanded_keys[offset]); + } + } + } else { + GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key); + GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num, + &end_key); + if (use_blob_db_) { +#ifndef ROCKSDB_LITE + // Stacked BlobDB + s = db_with_cfh->db->DeleteRange( + write_options_, db_with_cfh->db->DefaultColumnFamily(), + begin_key, end_key); +#endif // ROCKSDB_LITE + } else if (FLAGS_num_column_families <= 1) { + batch.DeleteRange(begin_key, end_key); + } else { + 
batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key, + end_key); + } + } + } + } + if (thread->shared->write_rate_limiter.get() != nullptr) { + thread->shared->write_rate_limiter->Request( + batch_bytes, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); + } + if (user_timestamp_size_ > 0) { + Slice user_ts = mock_app_clock_->Allocate(ts_guard.get()); + s = batch.UpdateTimestamps( + user_ts, [this](uint32_t) { return user_timestamp_size_; }); + if (!s.ok()) { + fprintf(stderr, "assign timestamp to write batch: %s\n", + s.ToString().c_str()); + ErrorExit(); + } + } + if (!use_blob_db_) { + // Not stacked BlobDB + s = db_with_cfh->db->Write(write_options_, &batch); + } + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, + entries_per_batch_, kWrite); + if (FLAGS_sine_write_rate) { + uint64_t now = FLAGS_env->NowMicros(); + + uint64_t usecs_since_last; + if (now > thread->stats.GetSineInterval()) { + usecs_since_last = now - thread->stats.GetSineInterval(); + } else { + usecs_since_last = 0; + } + + if (usecs_since_last > + (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) { + double usecs_since_start = + static_cast<double>(now - thread->stats.GetStart()); + thread->stats.ResetSineInterval(); + uint64_t write_rate = + static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0)); + thread->shared->write_rate_limiter.reset( + NewGenericRateLimiter(write_rate)); + } + } + if (!s.ok()) { + s = listener_->WaitForRecovery(600000000) ? 
Status::OK() : s; + } + + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + ErrorExit(); + } + } + if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) { + fprintf(stdout, + "Number of unique keys inserted: %" PRIu64 + ".\nNumber of overwrites: %" PRIu64 "\n", + num_unique_keys, num_overwrites); + } else if (kNumDispAndPersEntries > 0) { + fprintf(stdout, + "Number of unique keys inserted (disposable+persistent): %" PRIu64 + ".\nNumber of 'disposable entry delete': %" PRIu64 "\n", + num_written, num_selective_deletes); + } + if (num_range_deletions > 0) { + std::cout << "Number of range deletions: " << num_range_deletions + << std::endl; + } + thread->stats.AddBytes(bytes); + } + + Status DoDeterministicCompact(ThreadState* thread, + CompactionStyle compaction_style, + WriteMode write_mode) { +#ifndef ROCKSDB_LITE + ColumnFamilyMetaData meta; + std::vector<DB*> db_list; + if (db_.db != nullptr) { + db_list.push_back(db_.db); + } else { + for (auto& db : multi_dbs_) { + db_list.push_back(db.db); + } + } + std::vector<Options> options_list; + for (auto db : db_list) { + options_list.push_back(db->GetOptions()); + if (compaction_style != kCompactionStyleFIFO) { + db->SetOptions({{"disable_auto_compactions", "1"}, + {"level0_slowdown_writes_trigger", "400000000"}, + {"level0_stop_writes_trigger", "400000000"}}); + } else { + db->SetOptions({{"disable_auto_compactions", "1"}}); + } + } + + assert(!db_list.empty()); + auto num_db = db_list.size(); + size_t num_levels = static_cast<size_t>(open_options_.num_levels); + size_t output_level = open_options_.num_levels - 1; + std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db); + std::vector<size_t> num_files_at_level0(num_db, 0); + if (compaction_style == kCompactionStyleLevel) { + if (num_levels == 0) { + return Status::InvalidArgument("num_levels should be larger than 1"); + } + bool should_stop = false; + while (!should_stop) { + if (sorted_runs[0].empty()) { + DoWrite(thread, 
write_mode); + } else { + DoWrite(thread, UNIQUE_RANDOM); + } + for (size_t i = 0; i < num_db; i++) { + auto db = db_list[i]; + db->Flush(FlushOptions()); + db->GetColumnFamilyMetaData(&meta); + if (num_files_at_level0[i] == meta.levels[0].files.size() || + writes_ == 0) { + should_stop = true; + continue; + } + sorted_runs[i].emplace_back( + meta.levels[0].files.begin(), + meta.levels[0].files.end() - num_files_at_level0[i]); + num_files_at_level0[i] = meta.levels[0].files.size(); + if (sorted_runs[i].back().size() == 1) { + should_stop = true; + continue; + } + if (sorted_runs[i].size() == output_level) { + auto& L1 = sorted_runs[i].back(); + L1.erase(L1.begin(), L1.begin() + L1.size() / 3); + should_stop = true; + continue; + } + } + writes_ /= + static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier); + } + for (size_t i = 0; i < num_db; i++) { + if (sorted_runs[i].size() < num_levels - 1) { + fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", + num_levels); + exit(1); + } + } + for (size_t i = 0; i < num_db; i++) { + auto db = db_list[i]; + auto compactionOptions = CompactionOptions(); + compactionOptions.compression = FLAGS_compression_type_e; + auto options = db->GetOptions(); + MutableCFOptions mutable_cf_options(options); + for (size_t j = 0; j < sorted_runs[i].size(); j++) { + compactionOptions.output_file_size_limit = MaxFileSizeForLevel( + mutable_cf_options, static_cast<int>(output_level), + compaction_style); + std::cout << sorted_runs[i][j].size() << std::endl; + db->CompactFiles( + compactionOptions, + {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, + static_cast<int>(output_level - j) /*level*/); + } + } + } else if (compaction_style == kCompactionStyleUniversal) { + auto ratio = open_options_.compaction_options_universal.size_ratio; + bool should_stop = false; + while (!should_stop) { + if (sorted_runs[0].empty()) { + DoWrite(thread, write_mode); + } else { + DoWrite(thread, UNIQUE_RANDOM); + } + for 
(size_t i = 0; i < num_db; i++) { + auto db = db_list[i]; + db->Flush(FlushOptions()); + db->GetColumnFamilyMetaData(&meta); + if (num_files_at_level0[i] == meta.levels[0].files.size() || + writes_ == 0) { + should_stop = true; + continue; + } + sorted_runs[i].emplace_back( + meta.levels[0].files.begin(), + meta.levels[0].files.end() - num_files_at_level0[i]); + num_files_at_level0[i] = meta.levels[0].files.size(); + if (sorted_runs[i].back().size() == 1) { + should_stop = true; + continue; + } + num_files_at_level0[i] = meta.levels[0].files.size(); + } + writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) / + (ratio + 200)); + } + for (size_t i = 0; i < num_db; i++) { + if (sorted_runs[i].size() < num_levels) { + fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", + num_levels); + exit(1); + } + } + for (size_t i = 0; i < num_db; i++) { + auto db = db_list[i]; + auto compactionOptions = CompactionOptions(); + compactionOptions.compression = FLAGS_compression_type_e; + auto options = db->GetOptions(); + MutableCFOptions mutable_cf_options(options); + for (size_t j = 0; j < sorted_runs[i].size(); j++) { + compactionOptions.output_file_size_limit = MaxFileSizeForLevel( + mutable_cf_options, static_cast<int>(output_level), + compaction_style); + db->CompactFiles( + compactionOptions, + {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, + (output_level > j ? 
static_cast<int>(output_level - j) + : 0) /*level*/); + } + } + } else if (compaction_style == kCompactionStyleFIFO) { + if (num_levels != 1) { + return Status::InvalidArgument( + "num_levels should be 1 for FIFO compaction"); + } + if (FLAGS_num_multi_db != 0) { + return Status::InvalidArgument("Doesn't support multiDB"); + } + auto db = db_list[0]; + std::vector<std::string> file_names; + while (true) { + if (sorted_runs[0].empty()) { + DoWrite(thread, write_mode); + } else { + DoWrite(thread, UNIQUE_RANDOM); + } + db->Flush(FlushOptions()); + db->GetColumnFamilyMetaData(&meta); + auto total_size = meta.levels[0].size; + if (total_size >= + db->GetOptions().compaction_options_fifo.max_table_files_size) { + for (auto file_meta : meta.levels[0].files) { + file_names.emplace_back(file_meta.name); + } + break; + } + } + // TODO(shuzhang1989): Investigate why CompactFiles not working + // auto compactionOptions = CompactionOptions(); + // db->CompactFiles(compactionOptions, file_names, 0); + auto compactionOptions = CompactRangeOptions(); + db->CompactRange(compactionOptions, nullptr, nullptr); + } else { + fprintf(stdout, + "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n", + "filldeterministic"); + return Status::InvalidArgument("None compaction is not supported"); + } + +// Verify seqno and key range +// Note: the seqno get changed at the max level by implementation +// optimization, so skip the check of the max level. 
+#ifndef NDEBUG + for (size_t k = 0; k < num_db; k++) { + auto db = db_list[k]; + db->GetColumnFamilyMetaData(&meta); + // verify the number of sorted runs + if (compaction_style == kCompactionStyleLevel) { + assert(num_levels - 1 == sorted_runs[k].size()); + } else if (compaction_style == kCompactionStyleUniversal) { + assert(meta.levels[0].files.size() + num_levels - 1 == + sorted_runs[k].size()); + } else if (compaction_style == kCompactionStyleFIFO) { + // TODO(gzh): FIFO compaction + db->GetColumnFamilyMetaData(&meta); + auto total_size = meta.levels[0].size; + assert(total_size <= + db->GetOptions().compaction_options_fifo.max_table_files_size); + break; + } + + // verify smallest/largest seqno and key range of each sorted run + auto max_level = num_levels - 1; + int level; + for (size_t i = 0; i < sorted_runs[k].size(); i++) { + level = static_cast<int>(max_level - i); + SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber; + SequenceNumber sorted_run_largest_seqno = 0; + std::string sorted_run_smallest_key, sorted_run_largest_key; + bool first_key = true; + for (auto fileMeta : sorted_runs[k][i]) { + sorted_run_smallest_seqno = + std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno); + sorted_run_largest_seqno = + std::max(sorted_run_largest_seqno, fileMeta.largest_seqno); + if (first_key || + db->DefaultColumnFamily()->GetComparator()->Compare( + fileMeta.smallestkey, sorted_run_smallest_key) < 0) { + sorted_run_smallest_key = fileMeta.smallestkey; + } + if (first_key || + db->DefaultColumnFamily()->GetComparator()->Compare( + fileMeta.largestkey, sorted_run_largest_key) > 0) { + sorted_run_largest_key = fileMeta.largestkey; + } + first_key = false; + } + if (compaction_style == kCompactionStyleLevel || + (compaction_style == kCompactionStyleUniversal && level > 0)) { + SequenceNumber level_smallest_seqno = kMaxSequenceNumber; + SequenceNumber level_largest_seqno = 0; + for (auto fileMeta : meta.levels[level].files) { + level_smallest_seqno 
= + std::min(level_smallest_seqno, fileMeta.smallest_seqno); + level_largest_seqno = + std::max(level_largest_seqno, fileMeta.largest_seqno); + } + assert(sorted_run_smallest_key == + meta.levels[level].files.front().smallestkey); + assert(sorted_run_largest_key == + meta.levels[level].files.back().largestkey); + if (level != static_cast<int>(max_level)) { + // compaction at max_level would change sequence number + assert(sorted_run_smallest_seqno == level_smallest_seqno); + assert(sorted_run_largest_seqno == level_largest_seqno); + } + } else if (compaction_style == kCompactionStyleUniversal) { + // level <= 0 means sorted runs on level 0 + auto level0_file = + meta.levels[0].files[sorted_runs[k].size() - 1 - i]; + assert(sorted_run_smallest_key == level0_file.smallestkey); + assert(sorted_run_largest_key == level0_file.largestkey); + if (level != static_cast<int>(max_level)) { + assert(sorted_run_smallest_seqno == level0_file.smallest_seqno); + assert(sorted_run_largest_seqno == level0_file.largest_seqno); + } + } + } + } +#endif + // print the size of each sorted_run + for (size_t k = 0; k < num_db; k++) { + auto db = db_list[k]; + fprintf(stdout, + "---------------------- DB %" ROCKSDB_PRIszt + " LSM ---------------------\n", + k); + db->GetColumnFamilyMetaData(&meta); + for (auto& levelMeta : meta.levels) { + if (levelMeta.files.empty()) { + continue; + } + if (levelMeta.level == 0) { + for (auto& fileMeta : levelMeta.files) { + fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n", + levelMeta.level, fileMeta.name.c_str(), fileMeta.size); + } + } else { + fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n", + levelMeta.level, levelMeta.files.front().name.c_str(), + levelMeta.files.back().name.c_str(), levelMeta.size); + } + } + } + for (size_t i = 0; i < num_db; i++) { + db_list[i]->SetOptions( + {{"disable_auto_compactions", + std::to_string(options_list[i].disable_auto_compactions)}, + {"level0_slowdown_writes_trigger", + 
std::to_string(options_list[i].level0_slowdown_writes_trigger)}, + {"level0_stop_writes_trigger", + std::to_string(options_list[i].level0_stop_writes_trigger)}}); + } + return Status::OK(); +#else + (void)thread; + (void)compaction_style; + (void)write_mode; + fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n"); + return Status::NotSupported( + "Rocksdb Lite doesn't support filldeterministic"); +#endif // ROCKSDB_LITE + } + + void ReadSequential(ThreadState* thread) { + if (db_.db != nullptr) { + ReadSequential(thread, db_.db); + } else { + for (const auto& db_with_cfh : multi_dbs_) { + ReadSequential(thread, db_with_cfh.db); + } + } + } + + void ReadSequential(ThreadState* thread, DB* db) { + ReadOptions options = read_options_; + std::unique_ptr<char[]> ts_guard; + Slice ts; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get()); + options.timestamp = &ts; + } + + options.adaptive_readahead = FLAGS_adaptive_readahead; + options.async_io = FLAGS_async_io; + + Iterator* iter = db->NewIterator(options); + int64_t i = 0; + int64_t bytes = 0; + for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedOps(nullptr, db, 1, kRead); + ++i; + + if (thread->shared->read_rate_limiter.get() != nullptr && + i % 1024 == 1023) { + thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, + nullptr /* stats */, + RateLimiter::OpType::kRead); + } + } + + delete iter; + thread->stats.AddBytes(bytes); + } + + void ReadToRowCache(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + int64_t bytes = 0; + int64_t key_rand = 0; + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + PinnableSlice pinnable_val; + + while (key_rand < FLAGS_num) { + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread); + // We use same key_rand as 
seed for key and column family so that we can + // deterministically find the cfh corresponding to a particular key, as it + // is done in DoWrite method. + GenerateKeyFromInt(key_rand, FLAGS_num, &key); + key_rand++; + read++; + Status s; + if (FLAGS_num_column_families > 1) { + s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand), + key, &pinnable_val); + } else { + pinnable_val.Reset(); + s = db_with_cfh->db->Get(read_options_, + db_with_cfh->db->DefaultColumnFamily(), key, + &pinnable_val); + } + + if (s.ok()) { + found++; + bytes += key.size() + pinnable_val.size(); + } else if (!s.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str()); + abort(); + } + + if (thread->shared->read_rate_limiter.get() != nullptr && + read % 256 == 255) { + thread->shared->read_rate_limiter->Request( + 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + } + + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found, + read); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + } + + void ReadReverse(ThreadState* thread) { + if (db_.db != nullptr) { + ReadReverse(thread, db_.db); + } else { + for (const auto& db_with_cfh : multi_dbs_) { + ReadReverse(thread, db_with_cfh.db); + } + } + } + + void ReadReverse(ThreadState* thread, DB* db) { + Iterator* iter = db->NewIterator(read_options_); + int64_t i = 0; + int64_t bytes = 0; + for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedOps(nullptr, db, 1, kRead); + ++i; + if (thread->shared->read_rate_limiter.get() != nullptr && + i % 1024 == 1023) { + thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, + nullptr /* stats */, + RateLimiter::OpType::kRead); + } + } + delete iter; + thread->stats.AddBytes(bytes); + } + + void 
ReadRandomFast(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + int64_t nonexist = 0; + ReadOptions options = read_options_; + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + std::string value; + Slice ts; + std::unique_ptr<char[]> ts_guard; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + DB* db = SelectDBWithCfh(thread)->db; + + int64_t pot = 1; + while (pot < FLAGS_num) { + pot <<= 1; + } + + Duration duration(FLAGS_duration, reads_); + do { + for (int i = 0; i < 100; ++i) { + int64_t key_rand = thread->rand.Next() & (pot - 1); + GenerateKeyFromInt(key_rand, FLAGS_num, &key); + ++read; + std::string ts_ret; + std::string* ts_ptr = nullptr; + if (user_timestamp_size_ > 0) { + ts = mock_app_clock_->GetTimestampForRead(thread->rand, + ts_guard.get()); + options.timestamp = &ts; + ts_ptr = &ts_ret; + } + auto status = db->Get(options, key, &value, ts_ptr); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + abort(); + } + if (key_rand >= FLAGS_num) { + ++nonexist; + } + } + if (thread->shared->read_rate_limiter.get() != nullptr) { + thread->shared->read_rate_limiter->Request( + 100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + } + + thread->stats.FinishedOps(nullptr, db, 100, kRead); + } while (!duration.Done(100)); + + char msg[100]; + snprintf(msg, sizeof(msg), + "(%" PRIu64 " of %" PRIu64 + " found, " + "issued %" PRIu64 " non-exist keys)\n", + found, read, nonexist); + + thread->stats.AddMessage(msg); + } + + int64_t GetRandomKey(Random64* rand) { + uint64_t rand_int = rand->Next(); + int64_t key_rand; + if (read_random_exp_range_ == 0) { + key_rand = rand_int % FLAGS_num; + } else { + const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62; + long double order = -static_cast<long double>(rand_int % kBigInt) / + static_cast<long double>(kBigInt) * + 
read_random_exp_range_; + long double exp_ran = std::exp(order); + uint64_t rand_num = + static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num)); + // Map to a different number to avoid locality. + const uint64_t kBigPrime = 0x5bd1e995; + // Overflow is like %(2^64). Will have little impact of results. + key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num); + } + return key_rand; + } + + void ReadRandom(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + int64_t bytes = 0; + int num_keys = 0; + int64_t key_rand = 0; + ReadOptions options = read_options_; + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + PinnableSlice pinnable_val; + std::vector<PinnableSlice> pinnable_vals; + if (read_operands_) { + // Start off with a small-ish value that'll be increased later if + // `GetMergeOperands()` tells us it is not large enough. + pinnable_vals.resize(8); + } + std::unique_ptr<char[]> ts_guard; + Slice ts; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + + Duration duration(FLAGS_duration, reads_); + while (!duration.Done(1)) { + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread); + // We use same key_rand as seed for key and column family so that we can + // deterministically find the cfh corresponding to a particular key, as it + // is done in DoWrite method. 
+ if (entries_per_batch_ > 1 && FLAGS_multiread_stride) { + if (++num_keys == entries_per_batch_) { + num_keys = 0; + key_rand = GetRandomKey(&thread->rand); + if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >= + FLAGS_num) { + key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride; + } + } else { + key_rand += FLAGS_multiread_stride; + } + } else { + key_rand = GetRandomKey(&thread->rand); + } + GenerateKeyFromInt(key_rand, FLAGS_num, &key); + read++; + std::string ts_ret; + std::string* ts_ptr = nullptr; + if (user_timestamp_size_ > 0) { + ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get()); + options.timestamp = &ts; + ts_ptr = &ts_ret; + } + Status s; + pinnable_val.Reset(); + for (size_t i = 0; i < pinnable_vals.size(); ++i) { + pinnable_vals[i].Reset(); + } + ColumnFamilyHandle* cfh; + if (FLAGS_num_column_families > 1) { + cfh = db_with_cfh->GetCfh(key_rand); + } else { + cfh = db_with_cfh->db->DefaultColumnFamily(); + } + if (read_operands_) { + GetMergeOperandsOptions get_merge_operands_options; + get_merge_operands_options.expected_max_number_of_operands = + static_cast<int>(pinnable_vals.size()); + int number_of_operands; + s = db_with_cfh->db->GetMergeOperands( + options, cfh, key, pinnable_vals.data(), + &get_merge_operands_options, &number_of_operands); + if (s.IsIncomplete()) { + // Should only happen a few times when we encounter a key that had + // more merge operands than any key seen so far. Production use case + // would typically retry in such event to get all the operands so do + // that here. 
+ pinnable_vals.resize(number_of_operands); + get_merge_operands_options.expected_max_number_of_operands = + static_cast<int>(pinnable_vals.size()); + s = db_with_cfh->db->GetMergeOperands( + options, cfh, key, pinnable_vals.data(), + &get_merge_operands_options, &number_of_operands); + } + } else { + s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr); + } + + if (s.ok()) { + found++; + bytes += key.size() + pinnable_val.size() + user_timestamp_size_; + for (size_t i = 0; i < pinnable_vals.size(); ++i) { + bytes += pinnable_vals[i].size(); + pinnable_vals[i].Reset(); + } + } else if (!s.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str()); + abort(); + } + + if (thread->shared->read_rate_limiter.get() != nullptr && + read % 256 == 255) { + thread->shared->read_rate_limiter->Request( + 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + } + + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found, + read); + + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + } + + // Calls MultiGet over a list of keys from a random distribution. + // Returns the total number of keys found. 
  // Benchmark op: repeatedly issues MultiGet batches of entries_per_batch_
  // keys drawn from GetRandomKey() (or a strided sequence when
  // FLAGS_multiread_stride is set), until the Duration is exhausted.
  // Reports "(found of read)" via the per-thread stats message.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;            // total keys requested
    int64_t bytes = 0;           // payload bytes of successful lookups
    int64_t num_multireads = 0;  // batches issued (used for rate limiting)
    int64_t found = 0;           // keys that returned OK
    ReadOptions options = read_options_;
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]>> key_guards;
    std::vector<std::string> values(entries_per_batch_);
    // Batched-API path reuses one PinnableSlice array for the whole run;
    // the unique_ptr guard owns it.
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    // Allocate the key buffers once; key_guards keeps them alive while the
    // Slices in `keys` point into them.
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        // Strided mode: pick a random start, then space the batch's keys
        // FLAGS_multiread_stride apart, clamping so the last key stays
        // below FLAGS_num.
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        // Independent random keys for every slot in the batch.
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      if (!FLAGS_multiread_batched) {
        // Vector-based MultiGet overload: one Status and std::string value
        // per key.
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            // Any error other than NotFound aborts the benchmark.
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        // Array-based (batched) MultiGet overload writing into the reused
        // PinnableSlice/Status arrays.
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());

        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            bytes +=
                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          // Reset slots so the next batch starts clean.
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      // Charge the rate limiter once every 256 batches (256 * batch keys).
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    // NOTE(review): found/read are int64_t formatted with PRIu64; they are
    // non-negative here so the reinterpretation is benign — confirm intended.
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Calls ApproximateSize over random key ranges.
  // Benchmark op: repeatedly calls DB::GetApproximateSizes on
  // entries_per_batch_ random [lkey, rkey) ranges and reports the average
  // approximate size via the stats message.
  void ApproximateSizeRandom(ThreadState* thread) {
    int64_t size_sum = 0;   // sum of all approximate sizes returned
    int64_t num_sizes = 0;  // number of ranges queried
    const size_t batch_size = entries_per_batch_;
    std::vector<Range> ranges;
    std::vector<Slice> lkeys;
    std::vector<std::unique_ptr<const char[]>> lkey_guards;
    std::vector<Slice> rkeys;
    std::vector<std::unique_ptr<const char[]>> rkey_guards;
    std::vector<uint64_t> sizes;
    // Allocate the key buffers and Range objects once; the guards own the
    // buffers that the Slices (and hence the Ranges) point into.
    while (ranges.size() < batch_size) {
      // Ugly without C++17 return from emplace_back
      lkey_guards.emplace_back();
      rkey_guards.emplace_back();
      lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
      rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
      ranges.emplace_back(lkeys.back(), rkeys.back());
      sizes.push_back(0);
    }
    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      for (size_t i = 0; i < batch_size; ++i) {
        // Draw two random keys and order them so lkey <= rkey.
        int64_t lkey = GetRandomKey(&thread->rand);
        int64_t rkey = GetRandomKey(&thread->rand);
        if (lkey > rkey) {
          std::swap(lkey, rkey);
        }
        GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
        GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
      }
      db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
                              &sizes[0]);
      num_sizes += entries_per_batch_;
      for (int64_t size : sizes) {
        size_sum += size;
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
             static_cast<double>(size_sum) / static_cast<double>(num_sizes));
    thread->stats.AddMessage(msg);
  }

  // The inverse CDF of the Pareto distribution: maps a uniform sample u to a
  // Pareto-distributed integer with location theta, shape k, and scale sigma.
  // When k == 0 this degenerates to the exponential form.
  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
    double ret;
    if (k == 0.0) {
      ret = theta - sigma * std::log(u);
    } else {
      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
    }
    return static_cast<int64_t>(ceil(ret));
  }
  // The inverse function of the power distribution (y = a * x^b):
  // returns ceil((u / a)^(1 / b)).
  int64_t PowerCdfInversion(double u, double a, double b) {
    double ret;
    ret = std::pow((u / a), (1 / b));
    return static_cast<int64_t>(ceil(ret));
  }

  // Add noise to the QPS: perturbs `origin` by a random delta within a band
  // of width FLAGS_sine_a scaled by noise_ratio. Returns `origin` unchanged
  // when noise_ratio is out of [0, 1] or the perturbed value would go
  // negative. NOTE(review): uses rand(), so the noise is not per-thread
  // deterministic — confirm that is acceptable for this workload generator.
  double AddNoise(double origin, double noise_ratio) {
    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
      return origin;
    }
    int band_int = static_cast<int>(FLAGS_sine_a);
    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
    if (origin + delta < 0) {
      return origin;
    } else {
      return (origin + delta);
    }
  }

  // Decide the ratio of different query types
  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
  class QueryDecider {
   public:
    // Cumulative integer thresholds, one per query type, over [0, range_).
    std::vector<int> type_;
    // Normalized ratio (share of the total) per query type.
    std::vector<double> ratio_;
    // Total width of the threshold space; a random value mod range_ selects
    // a type via type_.
    int range_;

    QueryDecider() {}
    ~QueryDecider() {}

    // Normalize the input ratios and build the cumulative thresholds.
    // Assumes ratio_input is non-empty with a positive sum (divides by sum).
    Status Initiate(std::vector<double> ratio_input) {
      int range_max = 1000;
      double sum = 0.0;
      for (auto& ratio : ratio_input) {
        sum += ratio;
      }
      range_ = 0;
      for (auto& ratio : ratio_input) {
        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
        type_.push_back(range_);
        ratio_.push_back(ratio / sum);
      }
      return Status::OK();
    }

    // Map a random number to a query-type index: the first threshold in
    // type_ that exceeds (|rand_num| % range_) wins. Falls back to type 0.
    int GetType(int64_t rand_num) {
      if (rand_num < 0) {
        rand_num = rand_num * (-1);
      }
      assert(range_ != 0);
      int pos = static_cast<int>(rand_num % range_);
      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
        if (pos < type_[i]) {
          return i;
        }
      }
      return 0;
    }
  };

  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to transfer a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    int64_t keyrange_start;   // start offset in the amplified [0, max) space
    int64_t keyrange_access;  // width in that space (encodes hotness)
    int64_t keyrange_keys;    // number of keys in this key-range
  };

  // From our observations, the prefix hotness (key-range hotness) follows
  // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
  // However, we cannot directly use the inverse function to decide a
  // key-range from a random distribution.
  // To achieve it, we create a list of
  // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
  // decided based on the hotness of the key-range. When a random value is
  // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
  // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
  // selected is the same as the hotness of this KeyrangeUnit. After that, the
  // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
  // can use the power distribution (y=ax^b) to generate the offset of
  // the key in the selected key-range. In this way, we generate the keyID
  // based on the hotness of the prefix and also the key hotness distribution.
  class GenerateTwoTermExpKeys {
   public:
    // Avoid uninitialized warning-as-error in some compilers
    int64_t keyrange_rand_max_ = 0;  // total width of the amplified space
    int64_t keyrange_size_ = 0;      // keys per key-range
    int64_t keyrange_num_ = 0;       // number of key-ranges
    std::vector<KeyrangeUnit> keyrange_set_;

    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit. The per-range access probability follows the
    // two-term-exponential distribution parameterized by prefix_a..prefix_d.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate the key-range shares size based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key range will be
        // accessed in a query. It is based on the two-term exponential
        // distribution; probabilities below 1e-16 are clamped to 0.
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
        // by 1/(smallest key-range probability). In this way, we ensure that
        // all key-ranges are assigned with an Integer that >=0
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }

        // Step 3. For each key-range, we calculate its position in the
        // [0, amplify] range, including the start, the size (keyrange_access)
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;

      // Step 4. Shuffle the key-ranges randomly
      // Since the access probability is calculated from small to large,
      // If we do not re-allocate them, hot key-ranges are always at the end
      // and cold key-ranges are at the begin of the key space. Therefore, the
      // key-ranges are shuffled and the rand seed is only decided by the
      // key-range hotness distribution. With the same distribution parameters
      // the shuffle results are the same (the shuffle RNG is seeded with
      // keyrange_rand_max_, so it is deterministic per configuration).
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start position after shuffling
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }

      return Status::OK();
    }

    // Generate the Key ID according to the input ini_rand and key
    // distribution (key_dist_a, key_dist_b for the power distribution; pass
    // 0.0 for either to get a uniform offset inside the selected key-range).
    // NOTE(review): assumes InitiateExpDistribution produced
    // keyrange_rand_max_ > 0 (at least one range with nonzero probability);
    // otherwise the modulo below divides by zero — confirm callers guarantee
    // this.
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Binary-search the key-range whose [keyrange_start,
      // keyrange_start + keyrange_access) interval contains keyrange_rand.
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
        // Uniform offset within the key-range.
        key_offset = ini_rand % keyrange_size_;
      } else {
        // Power-distributed offset: invert y = a*x^b on the normalized
        // position, then use it to seed a secondary RNG.
        double u =
            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
        key_seed = static_cast<int64_t>(
            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = rand_key.Next() % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };

  // The social graph workload mixed with Get, Put, Iterator queries.
  // The value size and iterator length follow Pareto distribution.
  // The overall key access follow power distribution. If user models the
  // workload based on different key-ranges (or different prefixes), user
  // can use two-term-exponential distribution to fit the workload. User
  // needs to decide the ratio between Get, Put, Iterator queries before
  // starting the benchmark.
  void MixGraph(ThreadState* thread) {
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t get_found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    double total_scan_length = 0;
    double total_val_size = 0;
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    bool use_prefix_modeling = false;
    bool use_random_modeling = false;
    GenerateTwoTermExpKeys gen_exp;
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    // NOTE(review): 1 MiB stack buffer; used only as a sink for scanned
    // values so that reads are not optimized away.
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // the limit of qps initiation
    if (FLAGS_sine_mix_rate) {
      thread->shared->read_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    // Decide if user wants to use prefix based key generation
    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
      use_prefix_modeling = true;
      gen_exp.InitiateExpDistribution(
          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
    }
    if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
      use_random_modeling = true;
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t ini_rand, rand_v, key_rand, key_seed;
      ini_rand = GetRandomKey(&thread->rand);
      rand_v = ini_rand % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;

      // Generate the keyID based on the key hotness and prefix hotness
      if (use_random_modeling) {
        key_rand = ini_rand;
      } else if (use_prefix_modeling) {
        key_rand =
            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
      } else {
        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
        Random64 rand(key_seed);
        // NOTE(review): signed cast before % can yield a negative key_rand
        // when rand.Next() exceeds INT64_MAX — confirm GenerateKeyFromInt
        // tolerates that.
        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      int query_type = query.GetType(rand_v);

      // change the qps
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      if (FLAGS_sine_mix_rate &&
          usecs_since_last >
              (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        // Split the sine-modulated total rate across reads (Get + Seek)
        // and writes (Put) according to the configured mix ratios.
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate = mix_rate_with_noise * query.ratio_[1];

        if (read_rate > 0) {
          thread->shared->read_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(read_rate));
        }
        if (write_rate > 0) {
          thread->shared->write_rate_limiter->SetBytesPerSecond(
              static_cast<int64_t>(write_rate));
        }
      }
      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
                                   key, &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(read_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }

        if (s.ok()) {
          get_found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        // Charge the rate limiter once per 100 read ops.
        if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
          thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
                                                     nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        // Value size follows a Pareto distribution, clamped to [10, value_max].
        int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
                                              FLAGS_value_k, FLAGS_value_sigma);
        if (val_size < 10) {
          val_size = 10;
        } else if (val_size > value_max) {
          val_size = val_size % value_max;
        }
        total_val_size += val_size;

        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(val_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }

        if (thread->shared->write_rate_limiter && puts % 100 == 0) {
          thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
                                                      nullptr /*stats*/);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(read_options_);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            // Scan length follows a Pareto distribution, capped by
            // scan_len_max via modulo.
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              // Copy the value out so the read is not optimized away.
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
              total_scan_length++;
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    // NOTE(review): total_val_size / puts and total_scan_length / seek are
    // double divisions; when puts or seek is 0 they print inf/nan rather
    // than crash — confirm that is acceptable for the stats message.
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
             ", reads %" PRIu64 " in %" PRIu64
             " found, "
             "avg size: %.1f value, %.1f scan)\n",
             gets, puts, seek, get_found + seek_found, gets + seek,
             total_val_size / puts, total_scan_length / seek);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Repeatedly create and immediately destroy an iterator, to benchmark
  // iterator-construction cost in isolation.
  void IteratorCreation(ThreadState* thread) {
    Duration duration(FLAGS_duration, reads_);
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
        options.timestamp = &ts;
      }
      Iterator* iter = db->NewIterator(options);
      delete iter;
      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }
  }

  // Thread 0 writes in the background; all other threads create iterators.
  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      IteratorCreation(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  // Seek to random keys and scan FLAGS_seek_nexts entries forward (or
  // backward with --reverse_iterator), optionally bounded and/or using
  // tailing iterators.
  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    std::vector<Iterator*> tailing_iters;
    if (FLAGS_use_tailing_iterator) {
      if (db_.db != nullptr) {
        tailing_iters.push_back(db_.db->NewIterator(options));
      } else {
        for (const auto& db_with_cfh : multi_dbs_) {
          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
        }
      }
    }
    options.auto_prefix_mode = FLAGS_auto_prefix_mode;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        // Bound the scan on the side the iterator will move toward.
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
                 !FLAGS_reverse_iterator) {
        // Set upper bound to next prefix
        auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
        std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
        mutable_upper_bound[prefix_size_ - 1]++;
        upper_bound = Slice(upper_bound.data(), prefix_size_);
        options.iterate_upper_bound = &upper_bound;
      }

      // Pick an Iterator to use
      uint64_t db_idx_to_use =
          (db_.db == nullptr)
              ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
              : 0;
      std::unique_ptr<Iterator> single_iter;
      Iterator* iter_to_use;
      if (FLAGS_use_tailing_iterator) {
        iter_to_use = tailing_iters[db_idx_to_use];
      } else {
        if (db_.db != nullptr) {
          single_iter.reset(db_.db->NewIterator(options));
        } else {
          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
        }
        iter_to_use = single_iter.get();
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out iterator's value to make sure we read them.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      // Batch rate-limiter charges: one request per 256 reads.
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    for (auto iter : tailing_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Thread 0 writes in the background; all other threads run SeekRandom.
  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  // Thread 0 merges in the background; all other threads run SeekRandom.
  void SeekRandomWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

  // Delete keys in batches of entries_per_batch_: sequentially when seq is
  // true, otherwise at random positions in [0, FLAGS_num).
  void DoDelete(ThreadState* thread, bool seq) {
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
    int64_t i = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
        batch.Delete(key);
      }
      Status s;
      if (user_timestamp_size_ > 0) {
        // Stamp every key in the batch with one freshly allocated timestamp.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.UpdateTimestamps(
            ts, [this](uint32_t) { return user_timestamp_size_; });
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
          ErrorExit();
        }
      }
      s = db->Write(write_options_, &batch);
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
      i += entries_per_batch_;
    }
  }

  void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }

  void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }

  // Thread 0 writes in the background; all other threads read randomly.
  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  // Thread 0 writes in the background; all other threads run MultiGet reads.
  void MultiReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      MultiReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  // Thread 0 merges in the background; all other threads read randomly.
  void ReadWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }

  // Background writer thread used by the *WhileWriting/*WhileMerging
  // benchmarks. write_merge selects Put (kWrite) vs Merge; also issues
  // periodic range tombstones when configured.
  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    uint64_t num_range_deletions = 0;
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    uint32_t written = 0;
    bool hint_printed = false;

    while (true) {
      DB* db = SelectDB(thread);
      {
        // Check shared progress under the mutex to decide whether to stop.
        MutexLock l(&thread->shared->mu);
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
                      static_cast<int>(writes_) - written);
              hint_printed = true;
            }
          } else {
            // Finish the write immediately
            break;
          }
        }
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Status s;

      Slice val = gen.Generate();
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
      }
      if (write_merge == kWrite) {
        if (user_timestamp_size_ == 0) {
          s = db->Put(write_options_, key, val);
        } else {
          s = db->Put(write_options_, key, ts, val);
        }
      } else {
        s = db->Merge(write_options_, key, val);
      }
      // Restore write_options_
      written++;

      if (!s.ok()) {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
                                    nullptr /* stats */,
                                    RateLimiter::OpType::kWrite);
      }

      // Emit a range tombstone every writes_per_range_tombstone_ writes
      // (after an initial writes_before_delete_range_ warm-up), up to
      // max_num_range_tombstones_ tombstones in total.
      if (writes_per_range_tombstone_ > 0 &&
          written > writes_before_delete_range_ &&
          (written - writes_before_delete_range_) /
                  writes_per_range_tombstone_ <=
              max_num_range_tombstones_ &&
          (written - writes_before_delete_range_) %
                  writes_per_range_tombstone_ ==
              0) {
        num_range_deletions++;
        int64_t begin_num = thread->rand.Next() % FLAGS_num;
        if (FLAGS_expand_range_tombstones) {
          // Simulate the range tombstone with individual point deletes.
          for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
            GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                               &expanded_keys[offset]);
            if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
              // NOTE(review): this prints the stale Status `s` from the
              // preceding Put/Merge, not the failing Delete's status.
              fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
              exit(1);
            }
          }
        } else {
          GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
          GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                             &end_key);
          if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
                               begin_key, end_key)
                   .ok()) {
            // NOTE(review): same stale-Status issue as above.
            fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
            exit(1);
          }
        }
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
        // TODO: DeleteRange is not included in calculation of bytes/rate
        // limiter request
      }
    }
    if (num_range_deletions > 0) {
      std::cout << "Number of range deletions: " << num_range_deletions
                << std::endl;
    }
    thread->stats.AddBytes(bytes);
  }

  // Thread 0 scans in the background; all other threads read randomly.
  void ReadWhileScanning(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGScan(thread);
    }
  }

  // Background scanner: walks one iterator forward continuously, wrapping
  // back to the start whenever it falls off the end. Single-DB only.
  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options = read_options_;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      read_options.timestamp = &ts;
    }
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }

  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
  // in DB atomically i.e in a single batch. Also refer GetMany.
  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
                 const Slice& value) {
    // Suffix order is deliberately not sorted; GetMany reads them back under
    // one snapshot and only checks value equality, not insertion order.
    std::string suffixes[3] = {"2", "1", "0"};
    std::string keys[3];

    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Put(keys[i], value);
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }

  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
  // in DB atomically i.e in a single batch. Also refer GetMany.
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

    WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
                     user_timestamp_size_);
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.UpdateTimestamps(
          ts, [this](uint32_t) { return user_timestamp_size_; });
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }

  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  Status GetMany(DB* db, const Slice& key, std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = read_options_;

    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->Allocate(ts_guard.get());
      readoptionscopy.timestamp = &ts;
    }

    // All three reads share one snapshot so the consistency check is valid
    // even with concurrent writers.
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }

  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent)
  // (c) In order to achieve high % of 'found' during lookups, and to do
  // multiple writes (including puts and deletes) it uses upto
  // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of read_ or write_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Slice ts;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
        }
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s;
        if (user_timestamp_size_ > 0) {
          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
          s = db->Put(write_options_, key, ts, gen.Generate());
        } else {
          s = db->Put(write_options_, key, gen.Generate());
        }
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          ErrorExit();
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }

  //
  // Read-modify-write for random keys
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        // Read with newest timestamp because we are doing rmw.
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size() + user_timestamp_size_;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, val);
      } else {
        s = db->Put(write_options_, key, val);
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size() + user_timestamp_size_;
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }

  // Read-XOR-write for random keys. Xors the existing value with a randomly
  // generated value, and stores the result. Assuming A in the array of bytes
  // representing the existing value, we generate an array B of the same size,
  // then compute C = A^B as C[i]=A[i]^B[i], and store C
  void XORUpdateRandom(ThreadState* thread) {
    ReadOptions options = read_options_;
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    // the number of iterations is the larger of read_ or write_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Slice ts;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        options.timestamp = &ts;
      }

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      // Generate B with the same length as the existing value A.
      Slice value =
          gen.Generate(static_cast<unsigned int>(existing_value.size()));
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        // Key not present: XOR against nothing, i.e. store B itself.
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s;
      if (user_timestamp_size_ > 0) {
        ts = mock_app_clock_->Allocate(ts_guard.get());
        s = db->Put(write_options_, key, ts, Slice(new_value));
      } else {
        s = db->Put(write_options_, key, Slice(new_value));
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddMessage(msg);
  }

// Read-modify-write for random keys. + // Each operation causes the key grow by value_size (simulating an append). + // Generally used for benchmarking against merges of similar type + void AppendRandom(ThreadState* thread) { + ReadOptions options = read_options_; + RandomGenerator gen; + std::string value; + int64_t found = 0; + int64_t bytes = 0; + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + std::unique_ptr<char[]> ts_guard; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + Slice ts; + if (user_timestamp_size_ > 0) { + ts = mock_app_clock_->Allocate(ts_guard.get()); + options.timestamp = &ts; + } + + auto status = db->Get(options, key, &value); + if (status.ok()) { + ++found; + bytes += key.size() + value.size() + user_timestamp_size_; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + abort(); + } else { + // If not existing, then just assume an empty string of data + value.clear(); + } + + // Update the value (by appending data) + Slice operand = gen.Generate(); + if (value.size() > 0) { + // Use a delimiter to match the semantics for StringAppendOperator + value.append(1, ','); + } + value.append(operand.data(), operand.size()); + + Status s; + if (user_timestamp_size_ > 0) { + ts = mock_app_clock_->Allocate(ts_guard.get()); + s = db->Put(write_options_, key, ts, value); + } else { + // Write back to the database + s = db->Put(write_options_, key, value); + } + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + ErrorExit(); + } + bytes += key.size() + value.size() + user_timestamp_size_; + thread->stats.FinishedOps(nullptr, db, 1, kUpdate); + } + + 
char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", + readwrites_, found); + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys (using MergeOperator) + // The merge operator to use should be defined by FLAGS_merge_operator + // Adjust FLAGS_value_size so that the keys are reasonable for this operator + // Assumes that the merge operator is non-null (i.e.: is well-defined) + // + // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 + // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. + void MergeRandom(ThreadState* thread) { + RandomGenerator gen; + int64_t bytes = 0; + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread); + int64_t key_rand = thread->rand.Next() % merge_keys_; + GenerateKeyFromInt(key_rand, merge_keys_, &key); + + Status s; + Slice val = gen.Generate(); + if (FLAGS_num_column_families > 1) { + s = db_with_cfh->db->Merge(write_options_, + db_with_cfh->GetCfh(key_rand), key, val); + } else { + s = db_with_cfh->db->Merge( + write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val); + } + + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + bytes += key.size() + val.size(); + thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge); + } + + // Print some statistics + char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_); + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + } + + // Read and merge random keys. 
The numbers of reads and merges are controlled
+ // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+ // keys (and thus also the number of reads and merges on the same key) can be
+ // adjusted with FLAGS_merge_keys.
+ //
+ // As with MergeRandom, the merge operator to use should be defined by
+ // FLAGS_merge_operator.
+ void ReadRandomMergeRandom(ThreadState* thread) {
+ RandomGenerator gen;
+ std::string value;
+ int64_t num_hits = 0;
+ int64_t num_gets = 0;
+ int64_t num_merges = 0;
+ size_t max_length = 0;
+
+ std::unique_ptr<const char[]> key_guard;
+ Slice key = AllocateKey(&key_guard);
+ // the number of iterations is the larger of read_ or write_
+ Duration duration(FLAGS_duration, readwrites_);
+ while (!duration.Done(1)) {
+ DB* db = SelectDB(thread);
+ GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+ // FLAGS_mergereadpercent of the operations are merges, the rest reads.
+ bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+ if (do_merge) {
+ Status s = db->Merge(write_options_, key, gen.Generate());
+ if (!s.ok()) {
+ fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ num_merges++;
+ thread->stats.FinishedOps(nullptr, db, 1, kMerge);
+ } else {
+ Status s = db->Get(read_options_, key, &value);
+ // Track the largest merged value seen, reported in the summary line.
+ if (value.length() > max_length) max_length = value.length();
+
+ if (!s.ok() && !s.IsNotFound()) {
+ fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+ // we continue after error rather than exiting so that we can
+ // find more errors if any
+ } else if (!s.IsNotFound()) {
+ num_hits++;
+ }
+ num_gets++;
+ thread->stats.FinishedOps(nullptr, db, 1, kRead);
+ }
+ }
+
+ char msg[100];
+ snprintf(msg, sizeof(msg),
+ "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
+ " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
+ num_gets, num_merges, readwrites_, num_hits, max_length);
+ thread->stats.AddMessage(msg);
+ }
+
+ // Writes FLAGS_num keys sequentially, then seeks through them. See below.
+ void WriteSeqSeekSeq(ThreadState* thread) {
+ writes_ = FLAGS_num;
+ DoWrite(thread, SEQUENTIAL);
+ // 
exclude writes from the ops/sec calculation + thread->stats.Start(thread->tid); + + DB* db = SelectDB(thread); + ReadOptions read_opts = read_options_; + std::unique_ptr<char[]> ts_guard; + Slice ts; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get()); + read_opts.timestamp = &ts; + } + std::unique_ptr<Iterator> iter(db->NewIterator(read_opts)); + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + for (int64_t i = 0; i < FLAGS_num; ++i) { + GenerateKeyFromInt(i, FLAGS_num, &key); + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1, kSeek); + + for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) { + if (!FLAGS_reverse_iterator) { + iter->Next(); + } else { + iter->Prev(); + } + GenerateKeyFromInt(++i, FLAGS_num, &key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1, kSeek); + } + + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1, kSeek); + } + } + + bool binary_search(std::vector<int>& data, int start, int end, int key) { + if (data.empty()) return false; + if (start > end) return false; + int mid = start + (end - start) / 2; + if (mid > static_cast<int>(data.size()) - 1) return false; + if (data[mid] == key) { + return true; + } else if (data[mid] > key) { + return binary_search(data, start, mid - 1, key); + } else { + return binary_search(data, mid + 1, end, key); + } + } + + // Does a bunch of merge operations for a key(key1) where the merge operand + // is a sorted list. Next performance comparison is done between doing a Get + // for key1 followed by searching for another key(key2) in the large sorted + // list vs calling GetMergeOperands for key1 and then searching for the key2 + // in all the sorted sub-lists. Later case is expected to be a lot faster. 
+ void GetMergeOperands(ThreadState* thread) { + DB* db = SelectDB(thread); + const int kTotalValues = 100000; + const int kListSize = 100; + std::string key = "my_key"; + std::string value; + + for (int i = 1; i < kTotalValues; i++) { + if (i % kListSize == 0) { + // Remove trailing ',' + value.pop_back(); + db->Merge(WriteOptions(), key, value); + value.clear(); + } else { + value.append(std::to_string(i)).append(","); + } + } + + SortList s; + std::vector<int> data; + // This value can be experimented with and it will demonstrate the + // perf difference between doing a Get and searching for lookup_key in the + // resultant large sorted list vs doing GetMergeOperands and searching + // for lookup_key within this resultant sorted sub-lists. + int lookup_key = 1; + + // Get API call + std::cout << "--- Get API call --- \n"; + PinnableSlice p_slice; + uint64_t st = FLAGS_env->NowNanos(); + db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice); + s.MakeVector(data, p_slice); + bool found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + std::cout << "Found key? 
" << std::to_string(found) << "\n"; + uint64_t sp = FLAGS_env->NowNanos(); + std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n"; + std::string* dat_ = p_slice.GetSelf(); + std::cout << "Sample data from Get API call: " << dat_->substr(0, 10) + << "\n"; + data.clear(); + + // GetMergeOperands API call + std::cout << "--- GetMergeOperands API --- \n"; + std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1); + st = FLAGS_env->NowNanos(); + int number_of_operands = 0; + GetMergeOperandsOptions get_merge_operands_options; + get_merge_operands_options.expected_max_number_of_operands = + (kTotalValues / 100) + 1; + db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key, + a_slice.data(), &get_merge_operands_options, + &number_of_operands); + for (PinnableSlice& psl : a_slice) { + s.MakeVector(data, psl); + found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + data.clear(); + if (found) break; + } + std::cout << "Found key? " << std::to_string(found) << "\n"; + sp = FLAGS_env->NowNanos(); + std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0 + << " seconds \n"; + int to_print = 0; + std::cout << "Sample data from GetMergeOperands API call: "; + for (PinnableSlice& psl : a_slice) { + std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n"; + if (to_print++ > 2) break; + } + } + +#ifndef ROCKSDB_LITE + void VerifyChecksum(ThreadState* thread) { + DB* db = SelectDB(thread); + ReadOptions ro; + ro.adaptive_readahead = FLAGS_adaptive_readahead; + ro.async_io = FLAGS_async_io; + ro.rate_limiter_priority = + FLAGS_rate_limit_user_ops ? 
Env::IO_USER : Env::IO_TOTAL; + ro.readahead_size = FLAGS_readahead_size; + Status s = db->VerifyChecksum(ro); + if (!s.ok()) { + fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void VerifyFileChecksums(ThreadState* thread) { + DB* db = SelectDB(thread); + ReadOptions ro; + ro.adaptive_readahead = FLAGS_adaptive_readahead; + ro.async_io = FLAGS_async_io; + ro.rate_limiter_priority = + FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL; + ro.readahead_size = FLAGS_readahead_size; + Status s = db->VerifyFileChecksums(ro); + if (!s.ok()) { + fprintf(stderr, "VerifyFileChecksums() failed: %s\n", + s.ToString().c_str()); + exit(1); + } + } + + // This benchmark stress tests Transactions. For a given --duration (or + // total number of --writes, a Transaction will perform a read-modify-write + // to increment the value of a key in each of N(--transaction-sets) sets of + // keys (where each set has --num keys). If --threads is set, this will be + // done in parallel. + // + // To test transactions, use --transaction_db=true. Not setting this + // parameter + // will run the same benchmark without transactions. + // + // RandomTransactionVerify() will then validate the correctness of the results + // by checking if the sum of all keys in each set is the same. 
+ void RandomTransaction(ThreadState* thread) { + Duration duration(FLAGS_duration, readwrites_); + uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets); + uint64_t transactions_done = 0; + + if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) { + fprintf(stderr, "invalid value for transaction_sets\n"); + abort(); + } + + TransactionOptions txn_options; + txn_options.lock_timeout = FLAGS_transaction_lock_timeout; + txn_options.set_snapshot = FLAGS_transaction_set_snapshot; + + RandomTransactionInserter inserter(&thread->rand, write_options_, + read_options_, FLAGS_num, + num_prefix_ranges); + + if (FLAGS_num_multi_db > 1) { + fprintf(stderr, + "Cannot run RandomTransaction benchmark with " + "FLAGS_multi_db > 1."); + abort(); + } + + while (!duration.Done(1)) { + bool success; + + // RandomTransactionInserter will attempt to insert a key for each + // # of FLAGS_transaction_sets + if (FLAGS_optimistic_transaction_db) { + success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db); + } else if (FLAGS_transaction_db) { + TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db); + success = inserter.TransactionDBInsert(txn_db, txn_options); + } else { + success = inserter.DBInsert(db_.db); + } + + if (!success) { + fprintf(stderr, "Unexpected error: %s\n", + inserter.GetLastStatus().ToString().c_str()); + abort(); + } + + thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers); + transactions_done++; + } + + char msg[100]; + if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) { + snprintf(msg, sizeof(msg), + "( transactions:%" PRIu64 " aborts:%" PRIu64 ")", + transactions_done, inserter.GetFailureCount()); + } else { + snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done); + } + thread->stats.AddMessage(msg); + thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted())); + } + + // Verifies consistency of data after RandomTransaction() has been run. 
+ // Since each iteration of RandomTransaction() incremented a key in each set + // by the same value, the sum of the keys in each set should be the same. + void RandomTransactionVerify() { + if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) { + // transactions not used, nothing to verify. + return; + } + + Status s = RandomTransactionInserter::Verify( + db_.db, static_cast<uint16_t>(FLAGS_transaction_sets)); + + if (s.ok()) { + fprintf(stdout, "RandomTransactionVerify Success.\n"); + } else { + fprintf(stdout, "RandomTransactionVerify FAILED!!\n"); + } + } +#endif // ROCKSDB_LITE + + // Writes and deletes random keys without overwriting keys. + // + // This benchmark is intended to partially replicate the behavior of MyRocks + // secondary indices: All data is stored in keys and updates happen by + // deleting the old version of the key and inserting the new version. + void RandomReplaceKeys(ThreadState* thread) { + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + std::unique_ptr<char[]> ts_guard; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + std::vector<uint32_t> counters(FLAGS_numdistinct, 0); + size_t max_counter = 50; + RandomGenerator gen; + + Status s; + DB* db = SelectDB(thread); + for (int64_t i = 0; i < FLAGS_numdistinct; i++) { + GenerateKeyFromInt(i * max_counter, FLAGS_num, &key); + if (user_timestamp_size_ > 0) { + Slice ts = mock_app_clock_->Allocate(ts_guard.get()); + s = db->Put(write_options_, key, ts, gen.Generate()); + } else { + s = db->Put(write_options_, key, gen.Generate()); + } + if (!s.ok()) { + fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str()); + exit(1); + } + } + + db->GetSnapshot(); + + std::default_random_engine generator; + std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0, + FLAGS_stddev); + Duration duration(FLAGS_duration, FLAGS_num); + while (!duration.Done(1)) { + int64_t rnd_id = 
static_cast<int64_t>(distribution(generator)); + int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id), + static_cast<int64_t>(0)); + GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num, + &key); + if (user_timestamp_size_ > 0) { + Slice ts = mock_app_clock_->Allocate(ts_guard.get()); + s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts) + : db->Delete(write_options_, key, ts); + } else { + s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key) + : db->Delete(write_options_, key); + } + if (s.ok()) { + counters[key_id] = (counters[key_id] + 1) % max_counter; + GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num, + &key); + if (user_timestamp_size_ > 0) { + Slice ts = mock_app_clock_->Allocate(ts_guard.get()); + s = db->Put(write_options_, key, ts, Slice()); + } else { + s = db->Put(write_options_, key, Slice()); + } + } + + if (!s.ok()) { + fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str()); + exit(1); + } + + thread->stats.FinishedOps(nullptr, db, 1, kOthers); + } + + char msg[200]; + snprintf(msg, sizeof(msg), + "use single deletes: %d, " + "standard deviation: %lf\n", + FLAGS_use_single_deletes, FLAGS_stddev); + thread->stats.AddMessage(msg); + } + + void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) { + int64_t read = 0; + int64_t found = 0; + int64_t bytes = 0; + + Iterator* iter = nullptr; + // Only work on single database + assert(db_.db != nullptr); + iter = db_.db->NewIterator(read_options_); + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + + char value_buffer[256]; + while (true) { + { + MutexLock l(&thread->shared->mu); + if (thread->shared->num_done >= 1) { + // Write thread have finished + break; + } + } + if (!FLAGS_use_tailing_iterator) { + delete iter; + iter = db_.db->NewIterator(read_options_); + } + // Pick a Iterator to use + + int64_t key_id = thread->rand.Next() % FLAGS_key_id_range; + 
GenerateKeyFromInt(key_id, FLAGS_num, &key); + // Reset last 8 bytes to 0 + char* start = const_cast<char*>(key.data()); + start += key.size() - 8; + memset(start, 0, 8); + ++read; + + bool key_found = false; + // Seek the prefix + for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key); + iter->Next()) { + key_found = true; + // Copy out iterator's value to make sure we read them. + if (do_deletion) { + bytes += iter->key().size(); + if (KeyExpired(timestamp_emulator_.get(), iter->key())) { + thread->stats.FinishedOps(&db_, db_.db, 1, kDelete); + db_.db->Delete(write_options_, iter->key()); + } else { + break; + } + } else { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedOps(&db_, db_.db, 1, kRead); + Slice value = iter->value(); + memcpy(value_buffer, value.data(), + std::min(value.size(), sizeof(value_buffer))); + + assert(iter->status().ok()); + } + } + found += key_found; + + if (thread->shared->read_rate_limiter.get() != nullptr) { + thread->shared->read_rate_limiter->Request( + 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + } + } + delete iter; + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found, + read); + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + } + + void TimeSeriesWrite(ThreadState* thread) { + // Special thread that keeps writing until other threads are done. + RandomGenerator gen; + int64_t bytes = 0; + + // Don't merge stats from this thread with the readers. 
+ thread->stats.SetExcludeFromMerge(); + + std::unique_ptr<RateLimiter> write_rate_limiter; + if (FLAGS_benchmark_write_rate_limit > 0) { + write_rate_limiter.reset( + NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit)); + } + + std::unique_ptr<const char[]> key_guard; + Slice key = AllocateKey(&key_guard); + + Duration duration(FLAGS_duration, writes_); + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + + uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range; + // Write key id + GenerateKeyFromInt(key_id, FLAGS_num, &key); + // Write timestamp + + char* start = const_cast<char*>(key.data()); + char* pos = start + 8; + int bytes_to_fill = + std::min(key_size_ - static_cast<int>(pos - start), 8); + uint64_t timestamp_value = timestamp_emulator_->Get(); + if (port::kLittleEndian) { + for (int i = 0; i < bytes_to_fill; ++i) { + pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; + } + } else { + memcpy(pos, static_cast<void*>(×tamp_value), bytes_to_fill); + } + + timestamp_emulator_->Inc(); + + Status s; + Slice val = gen.Generate(); + s = db->Put(write_options_, key, val); + + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + ErrorExit(); + } + bytes = key.size() + val.size(); + thread->stats.FinishedOps(&db_, db_.db, 1, kWrite); + thread->stats.AddBytes(bytes); + + if (FLAGS_benchmark_write_rate_limit > 0) { + write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH, + nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + } + } + + void TimeSeries(ThreadState* thread) { + if (thread->tid > 0) { + bool do_deletion = FLAGS_expire_style == "delete" && + thread->tid <= FLAGS_num_deletion_threads; + TimeSeriesReadOrDelete(thread, do_deletion); + } else { + TimeSeriesWrite(thread); + thread->stats.Stop(); + thread->stats.Report("timeseries write"); + } + } + + void Compact(ThreadState* thread) { + DB* db = SelectDB(thread); + CompactRangeOptions cro; + cro.bottommost_level_compaction = + 
BottommostLevelCompaction::kForceOptimized;
+ db->CompactRange(cro, nullptr, nullptr);
+ }
+
+ // Full-range compaction of every open DB (single-DB and multi-DB modes).
+ void CompactAll() {
+ if (db_.db != nullptr) {
+ db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ for (const auto& db_with_cfh : multi_dbs_) {
+ db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ // Polls flush/compaction properties of one DB until all report idle,
+ // sleeping 10s between rounds. Exits the process if a property read fails.
+ void WaitForCompactionHelper(DBWithColumnFamilies& db) {
+ // This is an imperfect way of waiting for compaction. The loop and sleep
+ // is done because a thread that finishes a compaction job should get a
+ // chance to pickup a new compaction job.
+
+ std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
+ DB::Properties::kNumRunningFlushes,
+ DB::Properties::kCompactionPending,
+ DB::Properties::kNumRunningCompactions};
+
+ fprintf(stdout, "waitforcompaction(%s): started\n",
+ db.db->GetName().c_str());
+
+ while (true) {
+ bool retry = false;
+
+ for (const auto& k : keys) {
+ uint64_t v;
+ if (!db.db->GetIntProperty(k, &v)) {
+ fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
+ db.db->GetName().c_str(), k.c_str());
+ exit(1);
+ } else if (v > 0) {
+ // Any nonzero counter means background work is still in flight.
+ fprintf(stdout,
+ "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
+ db.db->GetName().c_str(), k.c_str());
+ FLAGS_env->SleepForMicroseconds(10 * 1000000);
+ retry = true;
+ break;
+ }
+ }
+
+ if (!retry) {
+ fprintf(stdout, "waitforcompaction(%s): finished\n",
+ db.db->GetName().c_str());
+ return;
+ }
+ }
+ }
+
+ // Waits until background compaction appears idle on every open DB.
+ void WaitForCompaction() {
+ // Give background threads a chance to wake
+ FLAGS_env->SleepForMicroseconds(5 * 1000000);
+
+ // I am skeptical that this check is race free. I hope that checking twice
+ // reduces the chance. 
+ if (db_.db != nullptr) { + WaitForCompactionHelper(db_); + WaitForCompactionHelper(db_); + } else { + for (auto& db_with_cfh : multi_dbs_) { + WaitForCompactionHelper(db_with_cfh); + WaitForCompactionHelper(db_with_cfh); + } + } + } + + bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) { + std::vector<LiveFileMetaData> files; + db_with_cfh.db->GetLiveFilesMetaData(&files); + + assert(from_level == 0 || from_level == 1); + + int real_from_level = from_level; + if (real_from_level > 0) { + // With dynamic leveled compaction the first level with data beyond L0 + // might not be L1. + real_from_level = std::numeric_limits<int>::max(); + + for (auto& f : files) { + if (f.level > 0 && f.level < real_from_level) real_from_level = f.level; + } + + if (real_from_level == std::numeric_limits<int>::max()) { + fprintf(stdout, "compact%d found 0 files to compact\n", from_level); + return true; + } + } + + // The goal is to compact from from_level to the level that follows it, + // and with dynamic leveled compaction the next level might not be + // real_from_level+1 + int next_level = std::numeric_limits<int>::max(); + + std::vector<std::string> files_to_compact; + for (auto& f : files) { + if (f.level == real_from_level) + files_to_compact.push_back(f.name); + else if (f.level > real_from_level && f.level < next_level) + next_level = f.level; + } + + if (files_to_compact.empty()) { + fprintf(stdout, "compact%d found 0 files to compact\n", from_level); + return true; + } else if (next_level == std::numeric_limits<int>::max()) { + // There is no data beyond real_from_level. So we are done. 
+ fprintf(stdout, "compact%d found no data beyond L%d\n", from_level, + real_from_level); + return true; + } + + fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n", + from_level, static_cast<int>(files_to_compact.size()), + real_from_level, next_level); + + ROCKSDB_NAMESPACE::CompactionOptions options; + // Lets RocksDB use the configured compression for this level + options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption; + + ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc; + db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc); + options.output_file_size_limit = cfDesc.options.target_file_size_base; + + Status status = + db_with_cfh.db->CompactFiles(options, files_to_compact, next_level); + if (!status.ok()) { + // This can fail for valid reasons including the operation was aborted + // or a filename is invalid because background compaction removed it. + // Having read the current cases for which an error is raised I prefer + // not to figure out whether an exception should be thrown here. 
+ fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level, + status.ToString().c_str()); + return false; + } + return true; + } + + void CompactLevel(int from_level) { + if (db_.db != nullptr) { + while (!CompactLevelHelper(db_, from_level)) WaitForCompaction(); + } + for (auto& db_with_cfh : multi_dbs_) { + while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction(); + } + } +#endif + + void Flush() { + FlushOptions flush_opt; + flush_opt.wait = true; + + if (db_.db != nullptr) { + Status s; + if (FLAGS_num_column_families > 1) { + s = db_.db->Flush(flush_opt, db_.cfh); + } else { + s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily()); + } + + if (!s.ok()) { + fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str()); + exit(1); + } + } else { + for (const auto& db_with_cfh : multi_dbs_) { + Status s; + if (FLAGS_num_column_families > 1) { + s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh); + } else { + s = db_with_cfh.db->Flush(flush_opt, + db_with_cfh.db->DefaultColumnFamily()); + } + + if (!s.ok()) { + fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str()); + exit(1); + } + } + } + fprintf(stdout, "flush memtable\n"); + } + + void ResetStats() { + if (db_.db != nullptr) { + db_.db->ResetStats(); + } + for (const auto& db_with_cfh : multi_dbs_) { + db_with_cfh.db->ResetStats(); + } + } + + void PrintStatsHistory() { + if (db_.db != nullptr) { + PrintStatsHistoryImpl(db_.db, false); + } + for (const auto& db_with_cfh : multi_dbs_) { + PrintStatsHistoryImpl(db_with_cfh.db, true); + } + } + + void PrintStatsHistoryImpl(DB* db, bool print_header) { + if (print_header) { + fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); + } + + std::unique_ptr<StatsHistoryIterator> shi; + Status s = + db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi); + if (!s.ok()) { + fprintf(stdout, "%s\n", s.ToString().c_str()); + return; + } + assert(shi); + while (shi->Valid()) { + uint64_t stats_time = 
shi->GetStatsTime(); + fprintf(stdout, "------ %s ------\n", + TimeToHumanString(static_cast<int>(stats_time)).c_str()); + for (auto& entry : shi->GetStatsMap()) { + fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time, + entry.first.c_str(), entry.second); + } + shi->Next(); + } + } + + void PrintStats(const char* key) { + if (db_.db != nullptr) { + PrintStats(db_.db, key, false); + } + for (const auto& db_with_cfh : multi_dbs_) { + PrintStats(db_with_cfh.db, key, true); + } + } + + void PrintStats(DB* db, const char* key, bool print_header = false) { + if (print_header) { + fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); + } + std::string stats; + if (!db->GetProperty(key, &stats)) { + stats = "(failed)"; + } + fprintf(stdout, "\n%s\n", stats.c_str()); + } + + void PrintStats(const std::vector<std::string>& keys) { + if (db_.db != nullptr) { + PrintStats(db_.db, keys); + } + for (const auto& db_with_cfh : multi_dbs_) { + PrintStats(db_with_cfh.db, keys, true); + } + } + + void PrintStats(DB* db, const std::vector<std::string>& keys, + bool print_header = false) { + if (print_header) { + fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); + } + + for (const auto& key : keys) { + std::string stats; + if (!db->GetProperty(key, &stats)) { + stats = "(failed)"; + } + fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str()); + } + } + +#ifndef ROCKSDB_LITE + + void Replay(ThreadState* thread) { + if (db_.db != nullptr) { + Replay(thread, &db_); + } + } + + void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) { + Status s; + std::unique_ptr<TraceReader> trace_reader; + s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file, + &trace_reader); + if (!s.ok()) { + fprintf( + stderr, + "Encountered an error creating a TraceReader from the trace file. 
" + "Error: %s\n", + s.ToString().c_str()); + exit(1); + } + std::unique_ptr<Replayer> replayer; + s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh, + std::move(trace_reader), &replayer); + if (!s.ok()) { + fprintf(stderr, + "Encountered an error creating a default Replayer. " + "Error: %s\n", + s.ToString().c_str()); + exit(1); + } + s = replayer->Prepare(); + if (!s.ok()) { + fprintf(stderr, "Prepare for replay failed. Error: %s\n", + s.ToString().c_str()); + } + s = replayer->Replay( + ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads), + FLAGS_trace_replay_fast_forward), + nullptr); + replayer.reset(); + if (s.ok()) { + fprintf(stdout, "Replay completed from trace_file: %s\n", + FLAGS_trace_file.c_str()); + } else { + fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str()); + } + } + + void Backup(ThreadState* thread) { + DB* db = SelectDB(thread); + std::unique_ptr<BackupEngineOptions> engine_options( + new BackupEngineOptions(FLAGS_backup_dir)); + Status s; + BackupEngine* backup_engine; + if (FLAGS_backup_rate_limit > 0) { + engine_options->backup_rate_limiter.reset(NewGenericRateLimiter( + FLAGS_backup_rate_limit, 100000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo)); + } + // Build new backup of the entire DB + engine_options->destroy_old_data = true; + s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine); + assert(s.ok()); + s = backup_engine->CreateNewBackup(db); + assert(s.ok()); + std::vector<BackupInfo> backup_info; + backup_engine->GetBackupInfo(&backup_info); + // Verify that a new backup is created + assert(backup_info.size() == 1); + } + + void Restore(ThreadState* /* thread */) { + std::unique_ptr<BackupEngineOptions> engine_options( + new BackupEngineOptions(FLAGS_backup_dir)); + if (FLAGS_restore_rate_limit > 0) { + engine_options->restore_rate_limiter.reset(NewGenericRateLimiter( + FLAGS_restore_rate_limit, 100000 /* refill_period_us */, + 10 /* fairness */, 
RateLimiter::Mode::kAllIo)); + } + BackupEngineReadOnly* backup_engine; + Status s = + BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine); + assert(s.ok()); + s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir, + FLAGS_restore_dir); + assert(s.ok()); + delete backup_engine; + } + +#endif // ROCKSDB_LITE +}; + +int db_bench_tool(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ConfigOptions config_options; + static bool initialized = false; + if (!initialized) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + SetVersionString(GetRocksVersionAsString(true)); + initialized = true; + } + ParseCommandLineFlags(&argc, &argv, true); + FLAGS_compaction_style_e = + (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style; +#ifndef ROCKSDB_LITE + if (FLAGS_statistics && !FLAGS_statistics_string.empty()) { + fprintf(stderr, + "Cannot provide both --statistics and --statistics_string.\n"); + exit(1); + } + if (!FLAGS_statistics_string.empty()) { + Status s = Statistics::CreateFromString(config_options, + FLAGS_statistics_string, &dbstats); + if (dbstats == nullptr) { + fprintf(stderr, + "No Statistics registered matching string: %s status=%s\n", + FLAGS_statistics_string.c_str(), s.ToString().c_str()); + exit(1); + } + } +#endif // ROCKSDB_LITE + if (FLAGS_statistics) { + dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + } + if (dbstats) { + dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level)); + } + FLAGS_compaction_pri_e = + (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri; + + std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit( + FLAGS_max_bytes_for_level_multiplier_additional, ','); + for (size_t j = 0; j < fanout.size(); j++) { + FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( +#ifndef CYGWIN + std::stoi(fanout[j])); +#else + stoi(fanout[j])); +#endif + } + + FLAGS_compression_type_e = + 
StringToCompressionType(FLAGS_compression_type.c_str()); + + FLAGS_wal_compression_e = + StringToCompressionType(FLAGS_wal_compression.c_str()); + + FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType( + FLAGS_compressed_secondary_cache_compression_type.c_str()); + +#ifndef ROCKSDB_LITE + // Stacked BlobDB + FLAGS_blob_db_compression_type_e = + StringToCompressionType(FLAGS_blob_db_compression_type.c_str()); + + int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty(); + if (env_opts > 1) { + fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n"); + exit(1); + } + + if (env_opts == 1) { + Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri, + &FLAGS_env, &env_guard); + if (!s.ok()) { + fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str()); + exit(1); + } + } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") { + //**TODO: Make the simulate fs something that can be loaded + // from the ObjectRegistry... 
+ static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env = + NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>( + FileSystem::Default(), FLAGS_simulate_hybrid_fs_file, + /*throughput_multiplier=*/ + int{FLAGS_simulate_hybrid_hdd_multipliers}, + /*is_full_fs_warm=*/FLAGS_simulate_hdd)); + FLAGS_env = composite_env.get(); + } + + // Let -readonly imply -use_existing_db + FLAGS_use_existing_db |= FLAGS_readonly; +#endif // ROCKSDB_LITE + + if (FLAGS_build_info) { + std::string build_info; + std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl; + // Similar to --version, nothing else will be done when this flag is set + exit(0); + } + + if (!FLAGS_seed) { + uint64_t now = FLAGS_env->GetSystemClock()->NowMicros(); + seed_base = static_cast<int64_t>(now); + fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n", + seed_base); + } else { + seed_base = FLAGS_seed; + } + + if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) { + fprintf(stderr, + "`-use_existing_db` must be true for `-use_existing_keys` to be " + "settable\n"); + exit(1); + } + + if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE")) + FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL")) + FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL")) + FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED")) + FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED; + else { + fprintf(stdout, "Unknown compaction fadvice:%s\n", + FLAGS_compaction_fadvice.c_str()); + exit(1); + } + + FLAGS_value_size_distribution_type_e = + StringToDistributionType(FLAGS_value_size_distribution_type.c_str()); + + // Note options sanitization may increase thread pool sizes according to + // 
max_background_flushes/max_background_compactions/max_background_jobs + FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads, + ROCKSDB_NAMESPACE::Env::Priority::HIGH); + FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads, + ROCKSDB_NAMESPACE::Env::Priority::BOTTOM); + FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads, + ROCKSDB_NAMESPACE::Env::Priority::LOW); + + // Choose a location for the test database if none given with --db=<path> + if (FLAGS_db.empty()) { + std::string default_db_path; + FLAGS_env->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path; + } + + if (FLAGS_backup_dir.empty()) { + FLAGS_backup_dir = FLAGS_db + "/backup"; + } + + if (FLAGS_restore_dir.empty()) { + FLAGS_restore_dir = FLAGS_db + "/restore"; + } + + if (FLAGS_stats_interval_seconds > 0) { + // When both are set then FLAGS_stats_interval determines the frequency + // at which the timer is checked for FLAGS_stats_interval_seconds + FLAGS_stats_interval = 1000; + } + + if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) { + fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::Benchmark benchmark; + benchmark.Run(); + +#ifndef ROCKSDB_LITE + if (FLAGS_print_malloc_stats) { + std::string stats_string; + ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string); + fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str()); + } +#endif // ROCKSDB_LITE + + return 0; +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/tools/db_bench_tool_test.cc b/src/rocksdb/tools/db_bench_tool_test.cc new file mode 100644 index 000000000..a406ff66c --- /dev/null +++ b/src/rocksdb/tools/db_bench_tool_test.cc @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/db_bench_tool.h" + +#include "db/db_impl/db_impl.h" +#include "options/options_parser.h" +#include "rocksdb/utilities/options_util.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +#ifdef GFLAGS +#include "util/gflags_compat.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static const int kMaxArgCount = 100; +static const size_t kArgBufferSize = 100000; +} // namespace + +class DBBenchTest : public testing::Test { + public: + DBBenchTest() : rnd_(0xFB) { + test_path_ = test::PerThreadDBPath("db_bench_test"); + Env::Default()->CreateDir(test_path_); + db_path_ = test_path_ + "/db"; + wal_path_ = test_path_ + "/wal"; + } + + ~DBBenchTest() { + // DestroyDB(db_path_, Options()); + } + + void ResetArgs() { + argc_ = 0; + cursor_ = 0; + memset(arg_buffer_, 0, kArgBufferSize); + } + + void AppendArgs(const std::vector<std::string>& args) { + for (const auto& arg : args) { + ASSERT_LE(cursor_ + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc_ + 1, kMaxArgCount); + snprintf(arg_buffer_ + cursor_, arg.size() + 1, "%s", arg.c_str()); + + argv_[argc_++] = arg_buffer_ + cursor_; + cursor_ += arg.size() + 1; + } + } + + // Gets the default options for this test/db_bench. + // Note that db_bench may change some of the default option values and that + // the database might as well. 
The options changed by db_bench are + // specified here; the ones by the DB are set via SanitizeOptions + Options GetDefaultOptions(CompactionStyle style = kCompactionStyleLevel, + int levels = 7) const { + Options opt; + + opt.create_if_missing = true; + opt.max_open_files = 256; + opt.max_background_compactions = 10; + opt.dump_malloc_stats = true; // db_bench uses a different default + opt.compaction_style = style; + opt.num_levels = levels; + opt.compression = kNoCompression; + opt.arena_block_size = 8388608; + + return SanitizeOptions(db_path_, opt); + } + + void RunDbBench(const std::string& options_file_name) { + AppendArgs({"./db_bench", "--benchmarks=fillseq", "--use_existing_db=0", + "--num=1000", "--compression_type=none", + std::string(std::string("--db=") + db_path_).c_str(), + std::string(std::string("--wal_dir=") + wal_path_).c_str(), + std::string(std::string("--options_file=") + options_file_name) + .c_str()}); + ASSERT_EQ(0, db_bench_tool(argc(), argv())); + } + + void VerifyOptions(const Options& opt) { + DBOptions loaded_db_opts; + std::vector<ColumnFamilyDescriptor> cf_descs; + ASSERT_OK(LoadLatestOptions(db_path_, Env::Default(), &loaded_db_opts, + &cf_descs)); + + ConfigOptions exact; + exact.input_strings_escaped = false; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, DBOptions(opt), + loaded_db_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + exact, ColumnFamilyOptions(opt), cf_descs[0].options)); + + // check with the default rocksdb options and expect failure + ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, DBOptions(), + loaded_db_opts)); + ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( + exact, ColumnFamilyOptions(), cf_descs[0].options)); + } + + char** argv() { return argv_; } + + int argc() { return argc_; } + + std::string db_path_; + std::string test_path_; + std::string wal_path_; + + char arg_buffer_[kArgBufferSize]; + char* 
argv_[kMaxArgCount]; + int argc_ = 0; + int cursor_ = 0; + Random rnd_; +}; + +namespace {} // namespace + +TEST_F(DBBenchTest, OptionsFile) { + const std::string kOptionsFileName = test_path_ + "/OPTIONS_test"; + Options opt = GetDefaultOptions(); + ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + {ColumnFamilyOptions(opt)}, kOptionsFileName, + opt.env->GetFileSystem().get())); + + // override the following options as db_bench will not take these + // options from the options file + opt.wal_dir = wal_path_; + + RunDbBench(kOptionsFileName); + opt.delayed_write_rate = 16 * 1024 * 1024; // Set by SanitizeOptions + + VerifyOptions(opt); +} + +TEST_F(DBBenchTest, OptionsFileUniversal) { + const std::string kOptionsFileName = test_path_ + "/OPTIONS_test"; + + Options opt = GetDefaultOptions(kCompactionStyleUniversal, 1); + + ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + {ColumnFamilyOptions(opt)}, kOptionsFileName, + opt.env->GetFileSystem().get())); + + // override the following options as db_bench will not take these + // options from the options file + opt.wal_dir = wal_path_; + RunDbBench(kOptionsFileName); + + VerifyOptions(opt); +} + +TEST_F(DBBenchTest, OptionsFileMultiLevelUniversal) { + const std::string kOptionsFileName = test_path_ + "/OPTIONS_test"; + + Options opt = GetDefaultOptions(kCompactionStyleUniversal, 12); + + ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"}, + {ColumnFamilyOptions(opt)}, kOptionsFileName, + opt.env->GetFileSystem().get())); + + // override the following options as db_bench will not take these + // options from the options file + opt.wal_dir = wal_path_; + + RunDbBench(kOptionsFileName); + VerifyOptions(opt); +} + +const std::string options_file_content = R"OPTIONS_FILE( +[Version] + rocksdb_version=4.3.1 + options_file_version=1.1 + +[DBOptions] + wal_bytes_per_sync=1048576 + delete_obsolete_files_period_micros=0 + WAL_ttl_seconds=0 + WAL_size_limit_MB=0 + db_write_buffer_size=0 + 
max_subcompactions=1 + table_cache_numshardbits=4 + max_open_files=-1 + max_file_opening_threads=10 + max_background_compactions=5 + use_fsync=false + use_adaptive_mutex=false + max_total_wal_size=18446744073709551615 + compaction_readahead_size=0 + keep_log_file_num=10 + skip_stats_update_on_db_open=false + max_manifest_file_size=18446744073709551615 + db_log_dir= + writable_file_max_buffer_size=1048576 + paranoid_checks=true + is_fd_close_on_exec=true + bytes_per_sync=1048576 + enable_thread_tracking=true + recycle_log_file_num=0 + create_missing_column_families=false + log_file_time_to_roll=0 + max_background_flushes=1 + create_if_missing=true + error_if_exists=false + delayed_write_rate=1048576 + manifest_preallocation_size=4194304 + allow_mmap_reads=false + allow_mmap_writes=false + use_direct_reads=false + use_direct_io_for_flush_and_compaction=false + stats_dump_period_sec=600 + allow_fallocate=true + max_log_file_size=83886080 + random_access_max_buffer_size=1048576 + advise_random_on_open=true + dump_malloc_stats=true + +[CFOptions "default"] + compaction_filter_factory=nullptr + table_factory=BlockBasedTable + prefix_extractor=nullptr + comparator=leveldb.BytewiseComparator + compression_per_level= + max_bytes_for_level_base=104857600 + bloom_locality=0 + target_file_size_base=10485760 + memtable_huge_page_size=0 + max_successive_merges=1000 + max_sequential_skip_in_iterations=8 + arena_block_size=52428800 + target_file_size_multiplier=1 + source_compaction_factor=1 + min_write_buffer_number_to_merge=1 + max_write_buffer_number=2 + write_buffer_size=419430400 + max_grandparent_overlap_factor=10 + max_bytes_for_level_multiplier=10 + memtable_factory=SkipListFactory + compression=kNoCompression + min_partial_merge_operands=2 + level0_stop_writes_trigger=100 + num_levels=1 + level0_slowdown_writes_trigger=50 + level0_file_num_compaction_trigger=10 + expanded_compaction_factor=25 + max_write_buffer_number_to_maintain=0 + max_write_buffer_size_to_maintain=0 + 
verify_checksums_in_compaction=true + merge_operator=nullptr + memtable_prefix_bloom_bits=0 + memtable_whole_key_filtering=true + paranoid_file_checks=false + inplace_update_num_locks=10000 + optimize_filters_for_hits=false + level_compaction_dynamic_level_bytes=false + inplace_update_support=false + compaction_style=kCompactionStyleUniversal + memtable_prefix_bloom_probes=6 + filter_deletes=false + hard_pending_compaction_bytes_limit=0 + disable_auto_compactions=false + compaction_measure_io_stats=false + enable_blob_files=true + min_blob_size=16 + blob_file_size=10485760 + blob_compression_type=kNoCompression + enable_blob_garbage_collection=true + blob_garbage_collection_age_cutoff=0.5 + blob_garbage_collection_force_threshold=0.75 + blob_compaction_readahead_size=262144 + blob_file_starting_level=0 + prepopulate_blob_cache=kDisable; + +[TableOptions/BlockBasedTable "default"] + format_version=0 + skip_table_builder_flush=false + cache_index_and_filter_blocks=false + flush_block_policy_factory=FlushBlockBySizePolicyFactory + index_type=kBinarySearch + whole_key_filtering=true + checksum=kCRC32c + no_block_cache=false + block_size=32768 + block_size_deviation=10 + block_restart_interval=16 + filter_policy=rocksdb.BuiltinBloomFilter +)OPTIONS_FILE"; + +TEST_F(DBBenchTest, OptionsFileFromFile) { + const std::string kOptionsFileName = test_path_ + "/OPTIONS_flash"; + std::unique_ptr<WritableFile> writable; + ASSERT_OK(Env::Default()->NewWritableFile(kOptionsFileName, &writable, + EnvOptions())); + ASSERT_OK(writable->Append(options_file_content)); + ASSERT_OK(writable->Close()); + + DBOptions db_opt; + std::vector<ColumnFamilyDescriptor> cf_descs; + ASSERT_OK(LoadOptionsFromFile(kOptionsFileName, Env::Default(), &db_opt, + &cf_descs)); + Options opt(db_opt, cf_descs[0].options); + opt.create_if_missing = true; + + // override the following options as db_bench will not take these + // options from the options file + opt.wal_dir = wal_path_; + + 
RunDbBench(kOptionsFileName); + + VerifyOptions(SanitizeOptions(db_path_, opt)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} + +#else + +int main(int argc, char** argv) { + printf("Skip db_bench_tool_test as the required library GFLAG is missing."); +} +#endif // #ifdef GFLAGS diff --git a/src/rocksdb/tools/db_crashtest.py b/src/rocksdb/tools/db_crashtest.py new file mode 100644 index 000000000..7035908cb --- /dev/null +++ b/src/rocksdb/tools/db_crashtest.py @@ -0,0 +1,1016 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse + +import os +import random +import shutil +import subprocess +import sys +import tempfile +import time + +# params overwrite priority: +# for default: +# default_params < {blackbox,whitebox}_default_params < args +# for simple: +# default_params < {blackbox,whitebox}_default_params < +# simple_default_params < +# {blackbox,whitebox}_simple_default_params < args +# for cf_consistency: +# default_params < {blackbox,whitebox}_default_params < +# cf_consistency_params < args +# for txn: +# default_params < {blackbox,whitebox}_default_params < txn_params < args +# for ts: +# default_params < {blackbox,whitebox}_default_params < ts_params < args +# for multiops_txn: +# default_params < {blackbox,whitebox}_default_params < multiops_txn_params < args + + +default_params = { + "acquire_snapshot_one_in": 10000, + "backup_max_size": 100 * 1024 * 1024, + # Consider larger number when backups considered more stable + "backup_one_in": 100000, + "batch_protection_bytes_per_key": lambda: random.choice([0, 8]), + "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), + 
    # (continuation of default_params) Each entry is either a fixed value or
    # a zero-arg lambda re-sampled per run, mapping directly to a db_stress
    # command-line flag of the same name.
    "block_size": 16384,
    "bloom_bits": lambda: random.choice(
        [random.randint(0, 19), random.lognormvariate(2.3, 1.3)]
    ),
    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
    "cache_size": 8388608,
    "charge_compression_dictionary_building_buffer": lambda: random.choice([0, 1]),
    "charge_filter_construction": lambda: random.choice([0, 1]),
    "charge_table_reader": lambda: random.choice([0, 1]),
    "charge_file_metadata": lambda: random.choice([0, 1]),
    "checkpoint_one_in": 1000000,
    "compression_type": lambda: random.choice(
        ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
    ),
    "bottommost_compression_type": lambda: "disable"
    if random.randint(0, 1) == 0
    else random.choice(["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]),
    "checksum_type": lambda: random.choice(
        ["kCRC32c", "kxxHash", "kxxHash64", "kXXH3"]
    ),
    "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
    "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
    # Disabled compression_parallel_threads as the feature is not stable
    # lambda: random.choice([1] * 9 + [4])
    "compression_parallel_threads": 1,
    "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1,
    "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "compaction_pri": random.randint(0, 4),
    "data_block_index_type": lambda: random.choice([0, 1]),
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
    "expected_values_dir": lambda: setup_expected_values_dir(),
    "fail_if_options_file_error": lambda: random.randint(0, 1),
    "flush_one_in": 1000000,
    "manual_wal_flush_one_in": lambda: random.choice([0, 0, 1000, 1000000]),
    "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
    "get_live_files_one_in": 1000000,
    # Note: the following two are intentionally disabled as the corresponding
    # APIs are not guaranteed to succeed.
    "get_sorted_wal_files_one_in": 0,
    "get_current_wal_file_one_in": 0,
    # Temporarily disable hash index
    "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
    "ingest_external_file_one_in": 1000000,
    "iterpercent": 10,
    "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 25000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    # Setting `nooverwritepercent > 0` is only possible because we do not vary
    # the random seed, so the same keys are chosen by every run for disallowing
    # overwrites.
    "nooverwritepercent": 1,
    "open_files": lambda: random.choice([-1, -1, 100, 500000]),
    "optimize_filters_for_memory": lambda: random.randint(0, 1),
    "partition_filters": lambda: random.randint(0, 1),
    "partition_pinning": lambda: random.randint(0, 3),
    "pause_background_one_in": 1000000,
    "prefix_size": lambda: random.choice([-1, 1, 5, 7, 8]),
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": lambda: random.randint(0, 1),
    "snapshot_hold_ops": 100000,
    "sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]),
    "sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]),
    "long_running_snapshots": lambda: random.randint(0, 1),
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    "test_batches_snapshots": random.randint(0, 1),
    "top_level_index_pinning": lambda: random.randint(0, 3),
    "unpartitioned_pinning": lambda: random.randint(0, 3),
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "mock_direct_io": False,
    "cache_type": lambda: random.choice(["lru_cache", "hyper_clock_cache"]),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    # use_put_entity_one_in has to be the same across invocations for verification to work, hence no lambda
    "use_put_entity_one_in": random.choice([0] * 7 + [1, 5, 10]),
    # 999 -> use Bloom API
    "ribbon_starting_level": lambda: random.choice([random.randint(-1, 10), 999]),
    "value_size_mult": 32,
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    "format_version": lambda: random.choice([2, 3, 4, 5, 5]),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
    "use_multiget": lambda: random.randint(0, 1),
    "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
    # 0 = never (used by some), 10 = often (for threading bugs), 600 = default
    "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
    "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
    # Test a small max_manifest_file_size only occasionally: most of the
    # time we want manifest history to be preserved to help debugging
    "max_manifest_file_size": lambda: random.choice(
        [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]
    ),
    # Sync mode might make test runs slower, so enable it only with small probability
    "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
    "bytes_per_sync": lambda: random.choice([0, 262144]),
    "wal_bytes_per_sync": lambda: random.choice([0, 524288]),
    # Disable compaction_readahead_size because the test is not passing.
    # "compaction_readahead_size" : lambda : random.choice(
    #     [0, 0, 1024 * 1024]),
    "db_write_buffer_size": lambda: random.choice(
        [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]
    ),
    "avoid_unnecessary_blocking_io": random.randint(0, 1),
    "write_dbid_to_manifest": random.randint(0, 1),
    "avoid_flush_during_recovery": lambda: random.choice(
        [1 if t == 0 else 0 for t in range(0, 8)]
    ),
    "max_write_batch_group_size_bytes": lambda: random.choice(
        [16, 64, 1024 * 1024, 16 * 1024 * 1024]
    ),
    "level_compaction_dynamic_level_bytes": True,
    "verify_checksum_one_in": 1000000,
    "verify_db_one_in": 100000,
    "continuous_verification_interval": 0,
    "max_key_len": 3,
    "key_len_percent_dist": "1,30,69",
    "read_fault_one_in": lambda: random.choice([0, 32, 1000]),
    "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
    "open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
    "open_read_fault_one_in": lambda: random.choice([0, 0, 32]),
    "sync_fault_injection": lambda: random.randint(0, 1),
    "get_property_one_in": 1000000,
    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
    "max_write_buffer_size_to_maintain": lambda: random.choice(
        [0, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024]
    ),
    "user_timestamp_size": 0,
    "secondary_cache_fault_one_in": lambda: random.choice([0, 0, 32]),
    "prepopulate_block_cache": lambda: random.choice([0, 1]),
    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
    "detect_filter_construct_corruption": lambda: random.choice([0, 1]),
    "adaptive_readahead": lambda: random.choice([0, 1]),
    "async_io": lambda: random.choice([0, 1]),
    "wal_compression": lambda: random.choice(["none", "zstd"]),
    "verify_sst_unique_id_in_manifest": 1,  # always do unique_id verification
    "secondary_cache_uri": lambda: random.choice(
        [
            "",
            "compressed_secondary_cache://capacity=8388608",
            "compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true",
        ]
    ),
    "allow_data_in_errors": True,
    "readahead_size": lambda: random.choice([0, 16384, 524288]),
    "initial_auto_readahead_size": lambda: random.choice([0, 16384, 524288]),
    "max_auto_readahead_size": lambda: random.choice([0, 16384, 524288]),
    "num_file_reads_for_auto_readahead": lambda: random.choice([0, 1, 2]),
    "min_write_buffer_number_to_merge": lambda: random.choice([1, 2]),
    "preserve_internal_time_seconds": lambda: random.choice([0, 60, 3600, 36000]),
}

# Environment variables consulted below: TEST_TMPDIR selects where test DBs
# and expected-value dirs are created; DEBUG_LEVEL distinguishes release builds.
_TEST_DIR_ENV_VAR = "TEST_TMPDIR"
_DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"

# Command used to launch db_stress and an optional pre-run cleanup command;
# both may be overridden via command-line args elsewhere in this script.
stress_cmd = "./db_stress"
cleanup_cmd = None


def is_release_mode():
    """Return True when running against a release build (DEBUG_LEVEL=0)."""
    return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"


def get_dbname(test_name):
    """Create and return a fresh database directory for `test_name`.

    Without TEST_TMPDIR, a unique directory is made in the system tmp dir.
    With TEST_TMPDIR, a fixed path under it is wiped (ignoring errors),
    the optional `cleanup_cmd` is run best-effort, and the dir is recreated.
    """
    test_dir_name = "rocksdb_crashtest_" + test_name
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        dbname = tempfile.mkdtemp(prefix=test_dir_name)
    else:
        dbname = test_tmpdir + "/" + test_dir_name
        shutil.rmtree(dbname, True)
        if cleanup_cmd is not None:
            print("Running DB cleanup command - %s\n" % cleanup_cmd)
            # Ignore failure
            os.system(cleanup_cmd)
        # NOTE(review): assumes test_tmpdir itself already exists; os.mkdir
        # raises otherwise (setup_multiops_txn_key_spaces_file creates it,
        # this function does not) -- confirm intended.
        os.mkdir(dbname)
    return dbname


# Lazily-created directory holding the expected-state files used for
# post-crash verification; shared by all runs via setup_expected_values_dir().
expected_values_dir = None


def setup_expected_values_dir():
    """Create (once) and return the expected-values directory.

    Placed under TEST_TMPDIR when set (recreated from scratch each time the
    first call happens), otherwise in a unique system temp directory.
    Subsequent calls return the cached path.
    """
    global expected_values_dir
    if expected_values_dir is not None:
        return expected_values_dir
    expected_dir_prefix = "rocksdb_crashtest_expected_"
    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
    if test_tmpdir is None or test_tmpdir == "":
        expected_values_dir = tempfile.mkdtemp(prefix=expected_dir_prefix)
    else:
        # if tmpdir is specified, store the expected_values_dir under that dir
        expected_values_dir = test_tmpdir + "/rocksdb_crashtest_expected"
        if os.path.exists(expected_values_dir):
            shutil.rmtree(expected_values_dir)
        os.mkdir(expected_values_dir)
    return expected_values_dir
+multiops_txn_key_spaces_file = None + + +def setup_multiops_txn_key_spaces_file(): + global multiops_txn_key_spaces_file + if multiops_txn_key_spaces_file is not None: + return multiops_txn_key_spaces_file + key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces" + test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) + if test_tmpdir is None or test_tmpdir == "": + multiops_txn_key_spaces_file = tempfile.mkstemp(prefix=key_spaces_file_prefix)[ + 1 + ] + else: + if not os.path.exists(test_tmpdir): + os.mkdir(test_tmpdir) + multiops_txn_key_spaces_file = tempfile.mkstemp( + prefix=key_spaces_file_prefix, dir=test_tmpdir + )[1] + return multiops_txn_key_spaces_file + + +def is_direct_io_supported(dbname): + with tempfile.NamedTemporaryFile(dir=dbname) as f: + try: + os.open(f.name, os.O_DIRECT) + except BaseException: + return False + return True + + +blackbox_default_params = { + "disable_wal": lambda: random.choice([0, 0, 0, 1]), + # total time for this script to test db_stress + "duration": 6000, + # time for one db_stress instance to run + "interval": 120, + # since we will be killing anyway, use large value for ops_per_thread + "ops_per_thread": 100000000, + "reopen": 0, + "set_options_one_in": 10000, +} + +whitebox_default_params = { + # TODO: enable this once we figure out how to adjust kill odds for WAL- + # disabled runs, and either (1) separate full `db_stress` runs out of + # whitebox crash or (2) support verification at end of `db_stress` runs + # that ran with WAL disabled. + "disable_wal": 0, + "duration": 10000, + "log2_keys_per_lock": 10, + "ops_per_thread": 200000, + "random_kill_odd": 888887, + "reopen": 20, +} + +simple_default_params = { + "allow_concurrent_memtable_write": lambda: random.randint(0, 1), + "column_families": 1, + # TODO: re-enable once internal task T124324915 is fixed. 
    # "experimental_mempurge_threshold": lambda: 10.0*random.random(),
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
    "level_compaction_dynamic_level_bytes": False,
    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
    "verify_iterator_with_expected_state_one_in": 5,  # this locks a range of keys
}

# Extra overrides when the simple flavor runs under the blackbox driver.
blackbox_simple_default_params = {
    "open_files": -1,
    "set_options_one_in": 0,
}

# Extra overrides when the simple flavor runs under the whitebox driver
# (currently none; kept for symmetry with blackbox_simple_default_params).
whitebox_simple_default_params = {}

# Overrides for the column-family consistency test mode.
cf_consistency_params = {
    "disable_wal": lambda: random.randint(0, 1),
    "reopen": 0,
    "test_cf_consistency": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    # Snapshots are used heavily in this test mode, while they are incompatible
    # with compaction filter.
    "enable_compaction_filter": 0,
    # `CfConsistencyStressTest::TestIngestExternalFile()` is not implemented.
    "ingest_external_file_one_in": 0,
}

# Overrides for the transaction (use_txn) test mode.
txn_params = {
    "use_txn": 1,
    # Avoid lambda to set it once for the entire test
    "txn_write_policy": random.randint(0, 2),
    "unordered_write": random.randint(0, 1),
    # TODO: there is such a thing as transactions with WAL disabled. We should
    # cover that case.
    "disable_wal": 0,
    # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
    "checkpoint_one_in": 0,
    # pipeline write is not currently compatible with WritePrepared txns
    "enable_pipelined_write": 0,
    "create_timestamped_snapshot_one_in": random.choice([0, 20]),
    # PutEntity in transactions is not yet implemented
    "use_put_entity_one_in" : 0,
}

# Overrides for the best-efforts-recovery test mode.
best_efforts_recovery_params = {
    "best_efforts_recovery": 1,
    "atomic_flush": 0,
    "disable_wal": 1,
    "column_families": 1,
}

# Overrides enabling the integrated BlobDB feature set.
blob_params = {
    "allow_setting_blob_options_dynamically": 1,
    # Enable blob files and GC with a 75% chance initially; note that they might still be
    # enabled/disabled during the test via SetOptions
    "enable_blob_files": lambda: random.choice([0] + [1] * 3),
    "min_blob_size": lambda: random.choice([0, 8, 16]),
    "blob_file_size": lambda: random.choice([1048576, 16777216, 268435456, 1073741824]),
    "blob_compression_type": lambda: random.choice(["none", "snappy", "lz4", "zstd"]),
    "enable_blob_garbage_collection": lambda: random.choice([0] + [1] * 3),
    "blob_garbage_collection_age_cutoff": lambda: random.choice(
        [0.0, 0.25, 0.5, 0.75, 1.0]
    ),
    "blob_garbage_collection_force_threshold": lambda: random.choice([0.5, 0.75, 1.0]),
    "blob_compaction_readahead_size": lambda: random.choice([0, 1048576, 4194304]),
    "blob_file_starting_level": lambda: random.choice(
        [0] * 4 + [1] * 3 + [2] * 2 + [3]
    ),
    "use_blob_cache": lambda: random.randint(0, 1),
    "use_shared_block_and_blob_cache": lambda: random.randint(0, 1),
    "blob_cache_size": lambda: random.choice([1048576, 2097152, 4194304, 8388608]),
    "prepopulate_blob_cache": lambda: random.randint(0, 1),
}

# Overrides for the user-defined-timestamp test mode; features that do not
# yet support timestamps are explicitly disabled.
ts_params = {
    "test_cf_consistency": 0,
    "test_batches_snapshots": 0,
    "user_timestamp_size": 8,
    "use_merge": 0,
    "use_full_merge_v1": 0,
    "use_txn": 0,
    "enable_blob_files": 0,
    "use_blob_db": 0,
    "ingest_external_file_one_in": 0,
    # PutEntity with timestamps is not yet implemented
    "use_put_entity_one_in" : 0,
}

# Overrides for the tiered-storage test mode.
tiered_params = {
    "enable_tiered_storage": 1,
    # Set tiered compaction hot data time as: 1 minute, 1 hour, 10 hours
    "preclude_last_level_data_seconds": lambda: random.choice([60, 3600, 36000]),
    # only test universal compaction for now, level has known issue of
    # endless compaction
    "compaction_style": 1,
    # tiered storage doesn't support blob db yet
    "enable_blob_files": 0,
    "use_blob_db": 0,
}

# Overrides for the multi-operation transaction test mode.
multiops_txn_default_params = {
    "test_cf_consistency": 0,
    "test_batches_snapshots": 0,
    "test_multi_ops_txns": 1,
    "use_txn": 1,
    "two_write_queues": lambda: random.choice([0, 1]),
    # TODO: enable write-prepared
    "disable_wal": 0,
    "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
    "clear_column_family_one_in": 0,
    "column_families": 1,
    "enable_pipelined_write": lambda: random.choice([0, 1]),
    # This test already acquires snapshots in reads
    "acquire_snapshot_one_in": 0,
    "backup_one_in": 0,
    "writepercent": 0,
    "delpercent": 0,
    "delrangepercent": 0,
    "customopspercent": 80,
    "readpercent": 5,
    "iterpercent": 15,
    "prefixpercent": 0,
    "verify_db_one_in": 1000,
    "continuous_verification_interval": 1000,
    "delay_snapshot_read_one_in": 3,
    # 65536 is the smallest possible value for write_buffer_size. Smaller
    # values will be sanitized to 65536 during db open. SetOptions currently
    # does not sanitize options, but very small write_buffer_size may cause
    # assertion failure in
    # https://github.com/facebook/rocksdb/blob/7.0.fb/db/memtable.cc#L117.
    "write_buffer_size": 65536,
    # flush more frequently to generate more files, thus trigger more
    # compactions.
+ "flush_one_in": 1000, + "key_spaces_path": setup_multiops_txn_key_spaces_file(), + "rollback_one_in": 4, + # Re-enable once we have a compaction for MultiOpsTxnStressTest + "enable_compaction_filter": 0, + "create_timestamped_snapshot_one_in": 50, + "sync_fault_injection": 0, + # PutEntity in transactions is not yet implemented + "use_put_entity_one_in" : 0, +} + +multiops_wc_txn_params = { + "txn_write_policy": 0, + # TODO re-enable pipelined write. Not well tested atm + "enable_pipelined_write": 0, +} + +multiops_wp_txn_params = { + "txn_write_policy": 1, + "wp_snapshot_cache_bits": 1, + # try small wp_commit_cache_bits, e.g. 0 once we explore storing full + # commit sequence numbers in commit cache + "wp_commit_cache_bits": 10, + # pipeline write is not currnetly compatible with WritePrepared txns + "enable_pipelined_write": 0, + # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns + "checkpoint_one_in": 0, + # Required to be 1 in order to use commit-time-batch + "use_only_the_last_commit_time_batch_for_recovery": 1, + "clear_wp_commit_cache_one_in": 10, + "create_timestamped_snapshot_one_in": 0, +} + + +def finalize_and_sanitize(src_params): + dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()} + if is_release_mode(): + dest_params["read_fault_one_in"] = 0 + if dest_params.get("compression_max_dict_bytes") == 0: + dest_params["compression_zstd_max_train_bytes"] = 0 + dest_params["compression_max_dict_buffer_bytes"] = 0 + if dest_params.get("compression_type") != "zstd": + dest_params["compression_zstd_max_train_bytes"] = 0 + if dest_params.get("allow_concurrent_memtable_write", 1) == 1: + dest_params["memtablerep"] = "skip_list" + if dest_params["mmap_read"] == 1: + dest_params["use_direct_io_for_flush_and_compaction"] = 0 + dest_params["use_direct_reads"] = 0 + if dest_params["file_checksum_impl"] != "none": + # TODO(T109283569): there is a bug in `GenerateOneFileChecksum()`, + # used by 
`IngestExternalFile()`, causing it to fail with mmap + # reads. Remove this once it is fixed. + dest_params["ingest_external_file_one_in"] = 0 + if ( + dest_params["use_direct_io_for_flush_and_compaction"] == 1 + or dest_params["use_direct_reads"] == 1 + ) and not is_direct_io_supported(dest_params["db"]): + if is_release_mode(): + print( + "{} does not support direct IO. Disabling use_direct_reads and " + "use_direct_io_for_flush_and_compaction.\n".format(dest_params["db"]) + ) + dest_params["use_direct_reads"] = 0 + dest_params["use_direct_io_for_flush_and_compaction"] = 0 + else: + dest_params["mock_direct_io"] = True + + if dest_params["test_batches_snapshots"] == 1: + dest_params["enable_compaction_filter"] = 0 + if dest_params["prefix_size"] < 0: + dest_params["prefix_size"] = 1 + + # Multi-key operations are not currently compatible with transactions or + # timestamp. + if (dest_params.get("test_batches_snapshots") == 1 or + dest_params.get("use_txn") == 1 or + dest_params.get("user_timestamp_size") > 0): + dest_params["ingest_external_file_one_in"] = 0 + if (dest_params.get("test_batches_snapshots") == 1 or + dest_params.get("use_txn") == 1): + dest_params["delpercent"] += dest_params["delrangepercent"] + dest_params["delrangepercent"] = 0 + if ( + dest_params.get("disable_wal") == 1 + or dest_params.get("sync_fault_injection") == 1 + or dest_params.get("manual_wal_flush_one_in") > 0 + ): + # File ingestion does not guarantee prefix-recoverability when unsynced + # data can be lost. Ingesting a file syncs data immediately that is + # newer than unsynced memtable data that can be lost on restart. + # + # Even if the above issue is fixed or worked around, our + # trace-and-replay does not trace file ingestion, so in its current form + # it would not recover the expected state to the correct point in time. 
+ dest_params["ingest_external_file_one_in"] = 0 + # The `DbStressCompactionFilter` can apply memtable updates to SST + # files, which would be problematic when unsynced data can be lost in + # crash recoveries. + dest_params["enable_compaction_filter"] = 0 + # Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb + if dest_params.get("unordered_write", 0) == 1: + dest_params["txn_write_policy"] = 1 + dest_params["allow_concurrent_memtable_write"] = 1 + if dest_params.get("disable_wal", 0) == 1: + dest_params["atomic_flush"] = 1 + dest_params["sync"] = 0 + dest_params["write_fault_one_in"] = 0 + if dest_params.get("open_files", 1) != -1: + # Compaction TTL and periodic compactions are only compatible + # with open_files = -1 + dest_params["compaction_ttl"] = 0 + dest_params["periodic_compaction_seconds"] = 0 + if dest_params.get("compaction_style", 0) == 2: + # Disable compaction TTL in FIFO compaction, because right + # now assertion failures are triggered. + dest_params["compaction_ttl"] = 0 + dest_params["periodic_compaction_seconds"] = 0 + if dest_params["partition_filters"] == 1: + if dest_params["index_type"] != 2: + dest_params["partition_filters"] = 0 + if dest_params.get("atomic_flush", 0) == 1: + # disable pipelined write when atomic flush is used. + dest_params["enable_pipelined_write"] = 0 + if dest_params.get("sst_file_manager_bytes_per_sec", 0) == 0: + dest_params["sst_file_manager_bytes_per_truncate"] = 0 + if dest_params.get("enable_compaction_filter", 0) == 1: + # Compaction filter is incompatible with snapshots. Need to avoid taking + # snapshots, as well as avoid operations that use snapshots for + # verification. + dest_params["acquire_snapshot_one_in"] = 0 + dest_params["compact_range_one_in"] = 0 + # Give the iterator ops away to reads. 
+ dest_params["readpercent"] += dest_params.get("iterpercent", 10) + dest_params["iterpercent"] = 0 + if dest_params.get("prefix_size") == -1: + dest_params["readpercent"] += dest_params.get("prefixpercent", 20) + dest_params["prefixpercent"] = 0 + if ( + dest_params.get("prefix_size") == -1 + and dest_params.get("memtable_whole_key_filtering") == 0 + ): + dest_params["memtable_prefix_bloom_size_ratio"] = 0 + if dest_params.get("two_write_queues") == 1: + dest_params["enable_pipelined_write"] = 0 + if dest_params.get("best_efforts_recovery") == 1: + dest_params["disable_wal"] = 1 + dest_params["atomic_flush"] = 0 + dest_params["enable_compaction_filter"] = 0 + dest_params["sync"] = 0 + dest_params["write_fault_one_in"] = 0 + if dest_params["secondary_cache_uri"] != "": + # Currently the only cache type compatible with a secondary cache is LRUCache + dest_params["cache_type"] = "lru_cache" + # Remove the following once write-prepared/write-unprepared with/without + # unordered write supports timestamped snapshots + if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0: + dest_params["txn_write_policy"] = 0 + dest_params["unordered_write"] = 0 + # For TransactionDB, correctness testing with unsync data loss is currently + # compatible with only write committed policy + if (dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy") != 0): + dest_params["sync_fault_injection"] = 0 + dest_params["manual_wal_flush_one_in"] = 0 + # PutEntity is currently not supported by SstFileWriter or in conjunction with Merge + if dest_params["use_put_entity_one_in"] != 0: + dest_params["ingest_external_file_one_in"] = 0 + dest_params["use_merge"] = 0 + dest_params["use_full_merge_v1"] = 0 + + return dest_params + + +def gen_cmd_params(args): + params = {} + + params.update(default_params) + if args.test_type == "blackbox": + params.update(blackbox_default_params) + if args.test_type == "whitebox": + params.update(whitebox_default_params) + if args.simple: + 
params.update(simple_default_params) + if args.test_type == "blackbox": + params.update(blackbox_simple_default_params) + if args.test_type == "whitebox": + params.update(whitebox_simple_default_params) + if args.cf_consistency: + params.update(cf_consistency_params) + if args.txn: + params.update(txn_params) + if args.test_best_efforts_recovery: + params.update(best_efforts_recovery_params) + if args.enable_ts: + params.update(ts_params) + if args.test_multiops_txn: + params.update(multiops_txn_default_params) + if args.write_policy == "write_committed": + params.update(multiops_wc_txn_params) + elif args.write_policy == "write_prepared": + params.update(multiops_wp_txn_params) + if args.test_tiered_storage: + params.update(tiered_params) + + # Best-effort recovery, user defined timestamp, tiered storage are currently + # incompatible with BlobDB. Test BE recovery if specified on the command + # line; otherwise, apply BlobDB related overrides with a 10% chance. + if ( + not args.test_best_efforts_recovery + and not args.enable_ts + and not args.test_tiered_storage + and random.choice([0] * 9 + [1]) == 1 + ): + params.update(blob_params) + + for k, v in vars(args).items(): + if v is not None: + params[k] = v + return params + + +def gen_cmd(params, unknown_params): + finalzied_params = finalize_and_sanitize(params) + cmd = ( + [stress_cmd] + + [ + "--{0}={1}".format(k, v) + for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)] + if k + not in { + "test_type", + "simple", + "duration", + "interval", + "random_kill_odd", + "cf_consistency", + "txn", + "test_best_efforts_recovery", + "enable_ts", + "test_multiops_txn", + "write_policy", + "stress_cmd", + "test_tiered_storage", + "cleanup_cmd", + } + and v is not None + ] + + unknown_params + ) + return cmd + + +def execute_cmd(cmd, timeout): + child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd))) + 
+ try: + outs, errs = child.communicate(timeout=timeout) + hit_timeout = False + print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode) + except subprocess.TimeoutExpired: + hit_timeout = True + child.kill() + print("KILLED %d\n" % child.pid) + outs, errs = child.communicate() + + return hit_timeout, child.returncode, outs.decode("utf-8"), errs.decode("utf-8") + + +# This script runs and kills db_stress multiple times. It checks consistency +# in case of unsafe crashes in RocksDB. +def blackbox_crash_main(args, unknown_args): + cmd_params = gen_cmd_params(args) + dbname = get_dbname("blackbox") + exit_time = time.time() + cmd_params["duration"] + + print( + "Running blackbox-crash-test with \n" + + "interval_between_crash=" + + str(cmd_params["interval"]) + + "\n" + + "total-duration=" + + str(cmd_params["duration"]) + + "\n" + ) + + while time.time() < exit_time: + cmd = gen_cmd( + dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args + ) + + hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"]) + + if not hit_timeout: + print("Exit Before Killing") + print("stdout:") + print(outs) + print("stderr:") + print(errs) + sys.exit(2) + + for line in errs.split("\n"): + if line != "" and not line.startswith("WARNING"): + print("stderr has error message:") + print("***" + line + "***") + + time.sleep(1) # time to stabilize before the next run + + time.sleep(1) # time to stabilize before the next run + + # we need to clean up after ourselves -- only do this on test success + shutil.rmtree(dbname, True) + + +# This python script runs db_stress multiple times. Some runs with +# kill_random_test that causes rocksdb to crash at various points in code. 
+def whitebox_crash_main(args, unknown_args): + cmd_params = gen_cmd_params(args) + dbname = get_dbname("whitebox") + + cur_time = time.time() + exit_time = cur_time + cmd_params["duration"] + half_time = cur_time + cmd_params["duration"] // 2 + + print( + "Running whitebox-crash-test with \n" + + "total-duration=" + + str(cmd_params["duration"]) + + "\n" + ) + + total_check_mode = 4 + check_mode = 0 + kill_random_test = cmd_params["random_kill_odd"] + kill_mode = 0 + prev_compaction_style = -1 + while time.time() < exit_time: + if check_mode == 0: + additional_opts = { + # use large ops per thread since we will kill it anyway + "ops_per_thread": 100 + * cmd_params["ops_per_thread"], + } + # run with kill_random_test, with three modes. + # Mode 0 covers all kill points. Mode 1 covers less kill points but + # increases change of triggering them. Mode 2 covers even less + # frequent kill points and further increases triggering change. + if kill_mode == 0: + additional_opts.update( + { + "kill_random_test": kill_random_test, + } + ) + elif kill_mode == 1: + if cmd_params.get("disable_wal", 0) == 1: + my_kill_odd = kill_random_test // 50 + 1 + else: + my_kill_odd = kill_random_test // 10 + 1 + additional_opts.update( + { + "kill_random_test": my_kill_odd, + "kill_exclude_prefixes": "WritableFileWriter::Append," + + "WritableFileWriter::WriteBuffered", + } + ) + elif kill_mode == 2: + # TODO: May need to adjust random odds if kill_random_test + # is too small. + additional_opts.update( + { + "kill_random_test": (kill_random_test // 5000 + 1), + "kill_exclude_prefixes": "WritableFileWriter::Append," + "WritableFileWriter::WriteBuffered," + "PosixMmapFile::Allocate,WritableFileWriter::Flush", + } + ) + # Run kill mode 0, 1 and 2 by turn. 
+ kill_mode = (kill_mode + 1) % 3 + elif check_mode == 1: + # normal run with universal compaction mode + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params["ops_per_thread"], + "compaction_style": 1, + } + # Single level universal has a lot of special logic. Ensure we cover + # it sometimes. + if random.randint(0, 1) == 1: + additional_opts.update( + { + "num_levels": 1, + } + ) + elif check_mode == 2: + # normal run with FIFO compaction mode + # ops_per_thread is divided by 5 because FIFO compaction + # style is quite a bit slower on reads with lot of files + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params["ops_per_thread"] // 5, + "compaction_style": 2, + } + else: + # normal run + additional_opts = { + "kill_random_test": None, + "ops_per_thread": cmd_params["ops_per_thread"], + } + + cur_compaction_style = additional_opts.get("compaction_style", cmd_params.get("compaction_style", 0)) + if prev_compaction_style != -1 and prev_compaction_style != cur_compaction_style: + print("`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style." + "\n") + additional_opts["destroy_db_initially"] = 1 + prev_compaction_style = cur_compaction_style + + cmd = gen_cmd( + dict( + list(cmd_params.items()) + + list(additional_opts.items()) + + list({"db": dbname}.items()) + ), + unknown_args, + ) + + print( + "Running:" + " ".join(cmd) + "\n" + ) # noqa: E999 T25377293 Grandfathered in + + # If the running time is 15 minutes over the run time, explicit kill and + # exit even if white box kill didn't hit. This is to guarantee run time + # limit, as if it runs as a job, running too long will create problems + # for job scheduling or execution. + # TODO detect a hanging condition. The job might run too long as RocksDB + # hits a hanging bug. 
+ hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd( + cmd, exit_time - time.time() + 900 + ) + msg = "check_mode={0}, kill option={1}, exitcode={2}\n".format( + check_mode, additional_opts["kill_random_test"], retncode + ) + + print(msg) + print(stdoutdata) + print(stderrdata) + + if hit_timeout: + print("Killing the run for running too long") + break + + expected = False + if additional_opts["kill_random_test"] is None and (retncode == 0): + # we expect zero retncode if no kill option + expected = True + elif additional_opts["kill_random_test"] is not None and retncode <= 0: + # When kill option is given, the test MIGHT kill itself. + # If it does, negative retncode is expected. Otherwise 0. + expected = True + + if not expected: + print("TEST FAILED. See kill option and exit code above!!!\n") + sys.exit(1) + + stderrdata = stderrdata.lower() + errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") + print("#times error occurred in output is " + str(errorcount) + "\n") + + if errorcount > 0: + print("TEST FAILED. Output has 'error'!!!\n") + sys.exit(2) + if stderrdata.find("fail") >= 0: + print("TEST FAILED. Output has 'fail'!!!\n") + sys.exit(2) + + # First half of the duration, keep doing kill test. For the next half, + # try different modes. + if time.time() > half_time: + # we need to clean up after ourselves -- only do this on test + # success + shutil.rmtree(dbname, True) + if cleanup_cmd is not None: + print("Running DB cleanup command - %s\n" % cleanup_cmd) + ret = os.system(cleanup_cmd) + if ret != 0: + print("TEST FAILED. 
DB cleanup returned error %d\n" % ret) + sys.exit(1) + os.mkdir(dbname) + if (expected_values_dir is not None): + shutil.rmtree(expected_values_dir, True) + os.mkdir(expected_values_dir) + + check_mode = (check_mode + 1) % total_check_mode + + time.sleep(1) # time to stabilize after a kill + + +def main(): + global stress_cmd + global cleanup_cmd + + parser = argparse.ArgumentParser( + description="This script runs and kills \ + db_stress multiple times" + ) + parser.add_argument("test_type", choices=["blackbox", "whitebox"]) + parser.add_argument("--simple", action="store_true") + parser.add_argument("--cf_consistency", action="store_true") + parser.add_argument("--txn", action="store_true") + parser.add_argument("--test_best_efforts_recovery", action="store_true") + parser.add_argument("--enable_ts", action="store_true") + parser.add_argument("--test_multiops_txn", action="store_true") + parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"]) + parser.add_argument("--stress_cmd") + parser.add_argument("--test_tiered_storage", action="store_true") + parser.add_argument("--cleanup_cmd") + + all_params = dict( + list(default_params.items()) + + list(blackbox_default_params.items()) + + list(whitebox_default_params.items()) + + list(simple_default_params.items()) + + list(blackbox_simple_default_params.items()) + + list(whitebox_simple_default_params.items()) + + list(blob_params.items()) + + list(ts_params.items()) + + list(multiops_txn_default_params.items()) + + list(multiops_wc_txn_params.items()) + + list(multiops_wp_txn_params.items()) + + list(best_efforts_recovery_params.items()) + + list(cf_consistency_params.items()) + + list(tiered_params.items()) + + list(txn_params.items()) + ) + + for k, v in all_params.items(): + parser.add_argument("--" + k, type=type(v() if callable(v) else v)) + # unknown_args are passed directly to db_stress + args, unknown_args = parser.parse_known_args() + + test_tmpdir = 
os.environ.get(_TEST_DIR_ENV_VAR) + if test_tmpdir is not None and not os.path.isdir(test_tmpdir): + print( + "%s env var is set to a non-existent directory: %s" + % (_TEST_DIR_ENV_VAR, test_tmpdir) + ) + sys.exit(1) + + if args.stress_cmd: + stress_cmd = args.stress_cmd + if args.cleanup_cmd: + cleanup_cmd = args.cleanup_cmd + if args.test_type == "blackbox": + blackbox_crash_main(args, unknown_args) + if args.test_type == "whitebox": + whitebox_crash_main(args, unknown_args) + # Only delete the `expected_values_dir` if test passes + if expected_values_dir is not None: + shutil.rmtree(expected_values_dir) + if multiops_txn_key_spaces_file is not None: + os.remove(multiops_txn_key_spaces_file) + + +if __name__ == "__main__": + main() diff --git a/src/rocksdb/tools/db_repl_stress.cc b/src/rocksdb/tools/db_repl_stress.cc new file mode 100644 index 000000000..ba680f4f2 --- /dev/null +++ b/src/rocksdb/tools/db_repl_stress.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include <atomic> +#include <cstdio> + +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/types.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" + +// Run a thread to perform Put's. +// Another thread uses GetUpdatesSince API to keep getting the updates. +// options : +// --num_inserts = the num of inserts the first thread should perform. +// --wal_ttl = the wal ttl for the run. 
+ +DEFINE_uint64(num_inserts, 1000, + "the num of inserts the first thread should" + " perform."); +DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)"); +DEFINE_uint64(wal_size_limit_MB, 10, + "the wal size limit for the run" + "(in MB)"); + +using ROCKSDB_NAMESPACE::BatchResult; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DestroyDB; +using ROCKSDB_NAMESPACE::Env; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::Random; +using ROCKSDB_NAMESPACE::SequenceNumber; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TransactionLogIterator; +using ROCKSDB_NAMESPACE::WriteOptions; + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +struct DataPumpThread { + DB* db; // Assumption DB is Open'ed already. +}; + +static void DataPumpThreadBody(void* arg) { + DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg); + DB* db = t->db; + Random rnd(301); + uint64_t i = 0; + while (i++ < FLAGS_num_inserts) { + if (!db->Put(WriteOptions(), Slice(rnd.RandomString(500)), + Slice(rnd.RandomString(500))) + .ok()) { + fprintf(stderr, "Error in put\n"); + exit(1); + } + } +} + +int main(int argc, const char** argv) { + SetUsageMessage( + std::string("\nUSAGE:\n") + std::string(argv[0]) + + " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" + + " --wal_size_limit_MB=<WAL_size_limit_MB>"); + ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true); + + Env* env = Env::Default(); + std::string default_db_path; + env->GetTestDirectory(&default_db_path); + default_db_path += "db_repl_stress"; + Options options; + options.create_if_missing = true; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + DB* db; + DestroyDB(default_db_path, options); + + Status s = DB::Open(options, default_db_path, &db); + + if (!s.ok()) { + fprintf(stderr, "Could not open DB due to %s\n", 
s.ToString().c_str()); + exit(1); + } + + DataPumpThread dataPump; + dataPump.db = db; + env->StartThread(DataPumpThreadBody, &dataPump); + + std::unique_ptr<TransactionLogIterator> iter; + SequenceNumber currentSeqNum = 1; + uint64_t num_read = 0; + for (;;) { + iter.reset(); + // Continue to probe a bit more after all received + size_t probes = 0; + while (!db->GetUpdatesSince(currentSeqNum, &iter).ok()) { + probes++; + if (probes > 100 && num_read >= FLAGS_num_inserts) { + if (num_read > FLAGS_num_inserts) { + fprintf(stderr, "Too many updates read: %ld expected: %ld\n", + (long)num_read, (long)FLAGS_num_inserts); + exit(1); + } + fprintf(stderr, "Successful!\n"); + return 0; + } + } + fprintf(stderr, "Refreshing iterator\n"); + for (; iter->Valid(); iter->Next(), num_read++, currentSeqNum++) { + BatchResult res = iter->GetBatch(); + if (res.sequence != currentSeqNum) { + fprintf(stderr, "Missed a seq no. b/w %ld and %ld\n", + (long)currentSeqNum, (long)res.sequence); + exit(1); + } + } + } +} + +#endif // GFLAGS + +#else // ROCKSDB_LITE +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/db_sanity_test.cc b/src/rocksdb/tools/db_sanity_test.cc new file mode 100644 index 000000000..8cc67f5d5 --- /dev/null +++ b/src/rocksdb/tools/db_sanity_test.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include <cstdio> +#include <cstdlib> +#include <memory> +#include <vector> + +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class SanityTest { + public: + explicit SanityTest(const std::string& path) + : env_(Env::Default()), path_(path) { + env_->CreateDirIfMissing(path); + } + virtual ~SanityTest() {} + + virtual std::string Name() const = 0; + virtual Options GetOptions() const = 0; + + Status Create() { + Options options = GetOptions(); + options.create_if_missing = true; + std::string dbname = path_ + Name(); + Status s = DestroyDB(dbname, options); + if (!s.ok()) { + return s; + } + DB* db = nullptr; + s = DB::Open(options, dbname, &db); + std::unique_ptr<DB> db_guard(db); + if (!s.ok()) { + return s; + } + for (int i = 0; i < 1000000; ++i) { + std::string k = "key" + std::to_string(i); + std::string v = "value" + std::to_string(i); + s = db->Put(WriteOptions(), Slice(k), Slice(v)); + if (!s.ok()) { + return s; + } + } + return db->Flush(FlushOptions()); + } + Status Verify() { + DB* db = nullptr; + std::string dbname = path_ + Name(); + Status s = DB::Open(GetOptions(), dbname, &db); + std::unique_ptr<DB> db_guard(db); + if (!s.ok()) { + return s; + } + for (int i = 0; i < 1000000; ++i) { + std::string k = "key" + std::to_string(i); + std::string v = "value" + std::to_string(i); + std::string result; + s = db->Get(ReadOptions(), Slice(k), &result); + if (!s.ok()) { + return s; + } + if (result != v) { + return Status::Corruption("Unexpected value for key " + k); + } + } + return Status::OK(); + } + + private: + Env* env_; + std::string const path_; +}; + +class SanityTestBasic : public SanityTest { + public: + explicit SanityTestBasic(const 
std::string& path) : SanityTest(path) {} + virtual Options GetOptions() const override { + Options options; + options.create_if_missing = true; + return options; + } + virtual std::string Name() const override { return "Basic"; } +}; + +class SanityTestSpecialComparator : public SanityTest { + public: + explicit SanityTestSpecialComparator(const std::string& path) + : SanityTest(path) { + options_.comparator = new NewComparator(); + } + ~SanityTestSpecialComparator() { delete options_.comparator; } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "SpecialComparator"; } + + private: + class NewComparator : public Comparator { + public: + virtual const char* Name() const override { + return "rocksdb.NewComparator"; + } + virtual int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, + const Slice& l) const override { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const override { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options options_; +}; + +class SanityTestZlibCompression : public SanityTest { + public: + explicit SanityTestZlibCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kZlibCompression; + } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "ZlibCompression"; } + + private: + Options options_; +}; + +class SanityTestZlibCompressionVersion2 : public SanityTest { + public: + explicit SanityTestZlibCompressionVersion2(const std::string& path) + : SanityTest(path) { + options_.compression = kZlibCompression; + BlockBasedTableOptions table_options; +#if ROCKSDB_MAJOR > 3 || (ROCKSDB_MAJOR == 3 && ROCKSDB_MINOR >= 10) + table_options.format_version = 2; +#endif + 
options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { + return "ZlibCompressionVersion2"; + } + + private: + Options options_; +}; + +class SanityTestLZ4Compression : public SanityTest { + public: + explicit SanityTestLZ4Compression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4Compression; + } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "LZ4Compression"; } + + private: + Options options_; +}; + +class SanityTestLZ4HCCompression : public SanityTest { + public: + explicit SanityTestLZ4HCCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4HCCompression; + } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "LZ4HCCompression"; } + + private: + Options options_; +}; + +class SanityTestZSTDCompression : public SanityTest { + public: + explicit SanityTestZSTDCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kZSTD; + } + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "ZSTDCompression"; } + + private: + Options options_; +}; + +#ifndef ROCKSDB_LITE +class SanityTestPlainTableFactory : public SanityTest { + public: + explicit SanityTestPlainTableFactory(const std::string& path) + : SanityTest(path) { + options_.table_factory.reset(NewPlainTableFactory()); + options_.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options_.allow_mmap_reads = true; + } + ~SanityTestPlainTableFactory() {} + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "PlainTable"; } + + private: + Options options_; +}; +#endif // ROCKSDB_LITE + +class SanityTestBloomFilter : public 
SanityTest { + public: + explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) { + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + ~SanityTestBloomFilter() {} + virtual Options GetOptions() const override { return options_; } + virtual std::string Name() const override { return "BloomFilter"; } + + private: + Options options_; +}; + +namespace { +bool RunSanityTests(const std::string& command, const std::string& path) { + bool result = true; +// Suppress false positive clang static anaylzer warnings. +#ifndef __clang_analyzer__ + std::vector<SanityTest*> sanity_tests = { + new SanityTestBasic(path), + new SanityTestSpecialComparator(path), + new SanityTestZlibCompression(path), + new SanityTestZlibCompressionVersion2(path), + new SanityTestLZ4Compression(path), + new SanityTestLZ4HCCompression(path), + new SanityTestZSTDCompression(path), +#ifndef ROCKSDB_LITE + new SanityTestPlainTableFactory(path), +#endif // ROCKSDB_LITE + new SanityTestBloomFilter(path)}; + + if (command == "create") { + fprintf(stderr, "Creating...\n"); + } else { + fprintf(stderr, "Verifying...\n"); + } + for (auto sanity_test : sanity_tests) { + Status s; + fprintf(stderr, "%s -- ", sanity_test->Name().c_str()); + if (command == "create") { + s = sanity_test->Create(); + } else { + assert(command == "verify"); + s = sanity_test->Verify(); + } + fprintf(stderr, "%s\n", s.ToString().c_str()); + if (!s.ok()) { + fprintf(stderr, "FAIL\n"); + result = false; + } + + delete sanity_test; + } +#endif // __clang_analyzer__ + return result; +} +} // namespace + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + std::string path, command; + bool ok = (argc == 3); + if (ok) { + path = std::string(argv[1]); + command = std::string(argv[2]); + ok = (command == "create" || command == "verify"); + } + if (!ok) { + fprintf(stderr, 
"Usage: %s <path> [create|verify] \n", argv[0]); + exit(1); + } + if (path.back() != '/') { + path += "/"; + } + + bool sanity_ok = ROCKSDB_NAMESPACE::RunSanityTests(command, path); + + return sanity_ok ? 0 : 1; +} diff --git a/src/rocksdb/tools/dbench_monitor b/src/rocksdb/tools/dbench_monitor new file mode 100755 index 000000000..d85f9d070 --- /dev/null +++ b/src/rocksdb/tools/dbench_monitor @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# +#(c) 2004-present, Facebook Inc. All rights reserved. +# +#see LICENSE file for more information on use/redistribution rights. +# + +# +#dbench_monitor: monitor db_bench process for violation of memory utilization +# +#default usage will monitor 'virtual memory size'. See below for standard options +#passed to db_bench during this test. +# +# See also: ./pflag for the actual monitoring script that does the work +# +#NOTE: +# You may end up with some /tmp/ files if db_bench OR +# this script OR ./pflag was killed unceremoniously +# +# If you see the script taking a long time, trying "kill" +# will usually cleanly exit. +# +# +DIR=`dirname $0` +LOG=/tmp/`basename $0`.$$ +DB_BENCH="$DIR/../db_bench"; +PFLAG=${DIR}/pflag + +usage() { + cat <<HELP; exit + +Usage: $0 [-h] + +-h: prints this help message + +This program will run the db_bench script to monitor memory usage +using the 'pflag' program. It launches db_bench with default settings +for certain arguments. You can change the defaults passed to +'db_bench' program, by setting the following environment +variables: + + bs [block_size] + ztype [compression_type] + benches [benchmarks] + reads [reads] + threads [threads] + cs [cache_size] + vsize [value_size] + comp [compression_ratio] + num [num] + +See the code for more info + +HELP + +} + +[ ! -x ${DB_BENCH} ] && echo "WARNING: ${DB_BENCH} doesn't exist, abort!" 
&& exit -1; + +[ "x$1" = "x-h" ] && usage; + +trap 'rm -f ${LOG}; kill ${PID}; echo "Interrupted, exiting";' 1 2 3 15 + +touch $LOG; + +: ${bs:=16384} +: ${ztype:=zlib} +: ${benches:=readwhilewriting} +: ${reads:=$((1*1024*1024))}; +: ${threads:=8} +: ${vsize:=2000} +: ${comp:=0.5} +: ${num:=10000} +: ${cs:=$((1*1024*1024*1024))}; + +DEBUG=1 #Set to 0 to remove chattiness + + +if [ "x$DEBUG" != "x" ]; then + # + #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/* + #cleanup the dir and re-run + # + echo DEBUG: Will run $DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db + +fi + +$DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db >$LOG 2>&1 & + +if [ $? -ne 0 ]; then + warn "WARNING: ${DB_BENCH} did not launch successfully! Abort!"; + exit; +fi +PID=$! + +# +#Start the monitoring. Default is "vsz" monitoring for upto cache_size ($cs) value of virtual mem +#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this +# +${PFLAG} -p $PID -v + +rm -f $LOG; diff --git a/src/rocksdb/tools/dump/db_dump_tool.cc b/src/rocksdb/tools/dump/db_dump_tool.cc new file mode 100644 index 000000000..427a54d99 --- /dev/null +++ b/src/rocksdb/tools/dump/db_dump_tool.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/db_dump_tool.h" + +#include <cinttypes> +#include <iostream> + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +bool DbDumpTool::Run(const DumpOptions& dump_options, + ROCKSDB_NAMESPACE::Options options) { + ROCKSDB_NAMESPACE::DB* dbptr; + ROCKSDB_NAMESPACE::Status status; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> dumpfile; + char hostname[1024]; + int64_t timesec = 0; + std::string abspath; + char json[4096]; + + static const char* magicstr = "ROCKDUMP"; + static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1}; + + ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); + + // Open the database + options.create_if_missing = false; + status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path, + &dbptr); + if (!status.ok()) { + std::cerr << "Unable to open database '" << dump_options.db_path + << "' for reading: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr); + + status = env->NewWritableFile(dump_options.dump_location, &dumpfile, + ROCKSDB_NAMESPACE::EnvOptions()); + if (!status.ok()) { + std::cerr << "Unable to open dump file '" << dump_options.dump_location + << "' for writing: " << status.ToString() << std::endl; + return false; + } + + ROCKSDB_NAMESPACE::Slice magicslice(magicstr, 8); + status = dumpfile->Append(magicslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + ROCKSDB_NAMESPACE::Slice versionslice(versionstr, 8); + status = dumpfile->Append(versionslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + if (dump_options.anonymous) { + snprintf(json, sizeof(json), "{}"); + } else { + status = env->GetHostName(hostname, sizeof(hostname)); + status = env->GetCurrentTime(×ec); + status = 
env->GetAbsolutePath(dump_options.db_path, &abspath); + snprintf(json, sizeof(json), + "{ \"database-path\": \"%s\", \"hostname\": \"%s\", " + "\"creation-time\": %" PRIi64 " }", + abspath.c_str(), hostname, timesec); + } + + ROCKSDB_NAMESPACE::Slice infoslice(json, strlen(json)); + char infosize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(infosize, (uint32_t)infoslice.size()); + ROCKSDB_NAMESPACE::Slice infosizeslice(infosize, 4); + status = dumpfile->Append(infosizeslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(infoslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it( + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions())); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + char keysize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(keysize, (uint32_t)it->key().size()); + ROCKSDB_NAMESPACE::Slice keysizeslice(keysize, 4); + status = dumpfile->Append(keysizeslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(it->key()); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + + char valsize[4]; + ROCKSDB_NAMESPACE::EncodeFixed32(valsize, (uint32_t)it->value().size()); + ROCKSDB_NAMESPACE::Slice valsizeslice(valsize, 4); + status = dumpfile->Append(valsizeslice); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + status = dumpfile->Append(it->value()); + if (!status.ok()) { + std::cerr << "Append failed: " << status.ToString() << std::endl; + return false; + } + } + if (!it->status().ok()) { + std::cerr << "Database iteration failed: " << status.ToString() + << std::endl; + return false; + } + return true; +} + +bool DbUndumpTool::Run(const 
UndumpOptions& undump_options, + ROCKSDB_NAMESPACE::Options options) { + ROCKSDB_NAMESPACE::DB* dbptr; + ROCKSDB_NAMESPACE::Status status; + ROCKSDB_NAMESPACE::Env* env; + std::unique_ptr<ROCKSDB_NAMESPACE::SequentialFile> dumpfile; + ROCKSDB_NAMESPACE::Slice slice; + char scratch8[8]; + + static const char* magicstr = "ROCKDUMP"; + static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1}; + + env = ROCKSDB_NAMESPACE::Env::Default(); + + status = env->NewSequentialFile(undump_options.dump_location, &dumpfile, + ROCKSDB_NAMESPACE::EnvOptions()); + if (!status.ok()) { + std::cerr << "Unable to open dump file '" << undump_options.dump_location + << "' for reading: " << status.ToString() << std::endl; + return false; + } + + status = dumpfile->Read(8, &slice, scratch8); + if (!status.ok() || slice.size() != 8 || + memcmp(slice.data(), magicstr, 8) != 0) { + std::cerr << "File '" << undump_options.dump_location + << "' is not a recognizable dump file." << std::endl; + return false; + } + + status = dumpfile->Read(8, &slice, scratch8); + if (!status.ok() || slice.size() != 8 || + memcmp(slice.data(), versionstr, 8) != 0) { + std::cerr << "File '" << undump_options.dump_location + << "' version not recognized." << std::endl; + return false; + } + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) { + std::cerr << "Unable to read info blob size." 
<< std::endl; + return false; + } + uint32_t infosize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + status = dumpfile->Skip(infosize); + if (!status.ok()) { + std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl; + return false; + } + + options.create_if_missing = true; + status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr); + if (!status.ok()) { + std::cerr << "Unable to open database '" << undump_options.db_path + << "' for writing: " << status.ToString() << std::endl; + return false; + } + + const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr); + + uint32_t last_keysize = 64; + size_t last_valsize = 1 << 20; + std::unique_ptr<char[]> keyscratch(new char[last_keysize]); + std::unique_ptr<char[]> valscratch(new char[last_valsize]); + + while (1) { + uint32_t keysize, valsize; + ROCKSDB_NAMESPACE::Slice keyslice; + ROCKSDB_NAMESPACE::Slice valslice; + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) break; + keysize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + if (keysize > last_keysize) { + while (keysize > last_keysize) last_keysize *= 2; + keyscratch = std::unique_ptr<char[]>(new char[last_keysize]); + } + + status = dumpfile->Read(keysize, &keyslice, keyscratch.get()); + if (!status.ok() || keyslice.size() != keysize) { + std::cerr << "Key read failure: " + << (status.ok() ? "insufficient data" : status.ToString()) + << std::endl; + return false; + } + + status = dumpfile->Read(4, &slice, scratch8); + if (!status.ok() || slice.size() != 4) { + std::cerr << "Unable to read value size: " + << (status.ok() ? 
"insufficient data" : status.ToString()) + << std::endl; + return false; + } + valsize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data()); + if (valsize > last_valsize) { + while (valsize > last_valsize) last_valsize *= 2; + valscratch = std::unique_ptr<char[]>(new char[last_valsize]); + } + + status = dumpfile->Read(valsize, &valslice, valscratch.get()); + if (!status.ok() || valslice.size() != valsize) { + std::cerr << "Unable to read value: " + << (status.ok() ? "insufficient data" : status.ToString()) + << std::endl; + return false; + } + + status = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), keyslice, valslice); + if (!status.ok()) { + fprintf(stderr, "Unable to write database entry\n"); + return false; + } + } + + if (undump_options.compact_db) { + status = db->CompactRange(ROCKSDB_NAMESPACE::CompactRangeOptions(), nullptr, + nullptr); + if (!status.ok()) { + fprintf(stderr, + "Unable to compact the database after loading the dumped file\n"); + return false; + } + } + return true; +} +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/dump/rocksdb_dump.cc b/src/rocksdb/tools/dump/rocksdb_dump.cc new file mode 100644 index 000000000..358457e92 --- /dev/null +++ b/src/rocksdb/tools/dump/rocksdb_dump.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#if !(defined GFLAGS) || defined(ROCKSDB_LITE) + +#include <cstdio> +int main() { +#ifndef GFLAGS + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); +#endif +#ifdef ROCKSDB_LITE + fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n"); +#endif + return 1; +} + +#else + +#include "rocksdb/convenience.h" +#include "rocksdb/db_dump_tool.h" +#include "util/gflags_compat.h" + +DEFINE_string(db_path, "", "Path to the db that will be dumped"); +DEFINE_string(dump_location, "", "Path to where the dump file location"); +DEFINE_bool(anonymous, false, + "Remove information like db path, creation time from dumped file"); +DEFINE_string(db_options, "", + "Options string used to open the database that will be dumped"); + +int main(int argc, char** argv) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_db_path == "" || FLAGS_dump_location == "") { + fprintf(stderr, "Please set --db_path and --dump_location\n"); + return 1; + } + + ROCKSDB_NAMESPACE::DumpOptions dump_options; + dump_options.db_path = FLAGS_db_path; + dump_options.dump_location = FLAGS_dump_location; + dump_options.anonymous = FLAGS_anonymous; + + ROCKSDB_NAMESPACE::Options db_options; + if (FLAGS_db_options != "") { + ROCKSDB_NAMESPACE::Options parsed_options; + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetOptionsFromString( + db_options, FLAGS_db_options, &parsed_options); + if (!s.ok()) { + fprintf(stderr, "Cannot parse provided db_options\n"); + return 1; + } + db_options = parsed_options; + } + + ROCKSDB_NAMESPACE::DbDumpTool tool; + if (!tool.Run(dump_options, db_options)) { + return 1; + } + return 0; +} +#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE) diff --git a/src/rocksdb/tools/dump/rocksdb_undump.cc b/src/rocksdb/tools/dump/rocksdb_undump.cc new file mode 100644 index 000000000..2ff128548 --- /dev/null +++ b/src/rocksdb/tools/dump/rocksdb_undump.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#if !(defined GFLAGS) || defined(ROCKSDB_LITE) + +#include <cstdio> +int main() { +#ifndef GFLAGS + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); +#endif +#ifdef ROCKSDB_LITE + fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n"); +#endif + return 1; +} + +#else + +#include "rocksdb/convenience.h" +#include "rocksdb/db_dump_tool.h" +#include "util/gflags_compat.h" + +DEFINE_string(dump_location, "", "Path to the dump file that will be loaded"); +DEFINE_string(db_path, "", "Path to the db that we will undump the file into"); +DEFINE_bool(compact, false, "Compact the db after loading the dumped file"); +DEFINE_string(db_options, "", + "Options string used to open the database that will be loaded"); + +int main(int argc, char **argv) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_db_path == "" || FLAGS_dump_location == "") { + fprintf(stderr, "Please set --db_path and --dump_location\n"); + return 1; + } + + ROCKSDB_NAMESPACE::UndumpOptions undump_options; + undump_options.db_path = FLAGS_db_path; + undump_options.dump_location = FLAGS_dump_location; + undump_options.compact_db = FLAGS_compact; + + ROCKSDB_NAMESPACE::Options db_options; + if (FLAGS_db_options != "") { + ROCKSDB_NAMESPACE::Options parsed_options; + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetOptionsFromString( + db_options, FLAGS_db_options, &parsed_options); + if (!s.ok()) { + fprintf(stderr, "Cannot parse provided db_options\n"); + return 1; + } + db_options = parsed_options; + } + + ROCKSDB_NAMESPACE::DbUndumpTool tool; + if (!tool.Run(undump_options, db_options)) { + return 1; + } + return 0; +} +#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE) diff --git a/src/rocksdb/tools/generate_random_db.sh 
b/src/rocksdb/tools/generate_random_db.sh new file mode 100755 index 000000000..5b5962617 --- /dev/null +++ b/src/rocksdb/tools/generate_random_db.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to load some pre generated data file to a DB using ldb tool +# ./ldb needs to be avaible to be executed. +# +# Usage: <SCRIPT> <input_data_path> <DB Path> + +if [ "$#" -lt 2 ]; then + echo "usage: $BASH_SOURCE <input_data_path> <DB Path>" + exit 1 +fi + +input_data_dir=$1 +db_dir=$2 +rm -rf $db_dir + +echo == Loading data from $input_data_dir to $db_dir + +declare -a compression_opts=("no" "snappy" "zlib" "bzip2") + +set -e + +n=0 + +for f in `ls -1 $input_data_dir` +do + echo == Loading $f with compression ${compression_opts[n % 4]} + ./ldb load --db=$db_dir --compression_type=${compression_opts[n % 4]} --bloom_bits=10 --auto_compaction=false --create_if_missing < $input_data_dir/$f + let "n = n + 1" +done diff --git a/src/rocksdb/tools/ingest_external_sst.sh b/src/rocksdb/tools/ingest_external_sst.sh new file mode 100755 index 000000000..8e2fed7ce --- /dev/null +++ b/src/rocksdb/tools/ingest_external_sst.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# + +if [ "$#" -lt 2 ]; then + echo "usage: $BASH_SOURCE <DB Path> <External SST Dir>" + exit 1 +fi + +db_dir=$1 +external_sst_dir=$2 + +for f in `find $external_sst_dir -name extern_sst*` +do + echo == Ingesting external SST file $f to DB at $db_dir + ./ldb --db=$db_dir --create_if_missing ingest_extern_sst $f +done diff --git a/src/rocksdb/tools/io_tracer_parser.cc b/src/rocksdb/tools/io_tracer_parser.cc new file mode 100644 index 000000000..41ef45d97 --- /dev/null +++ b/src/rocksdb/tools/io_tracer_parser.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else // GFLAGS +#include "tools/io_tracer_parser_tool.h" +int main(int argc, char** argv) { + return ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv); +} +#endif // GFLAGS +#else // ROCKSDB_LITE +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/io_tracer_parser_test.cc b/src/rocksdb/tools/io_tracer_parser_test.cc new file mode 100644 index 000000000..41be5fa96 --- /dev/null +++ b/src/rocksdb/tools/io_tracer_parser_test.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run io_tracer_parser_test\n"); + return 0; +} +#else + +#include <string> +#include <vector> + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "tools/io_tracer_parser_tool.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +const int kMaxArgCount = 100; +const size_t kArgBufferSize = 100000; +} // namespace + +class IOTracerParserTest : public testing::Test { + public: + IOTracerParserTest() { + test_path_ = test::PerThreadDBPath("io_tracer_parser_test"); + env_ = ROCKSDB_NAMESPACE::Env::Default(); + EXPECT_OK(env_->CreateDirIfMissing(test_path_)); + trace_file_path_ = test_path_ + "/io_trace_file"; + dbname_ = test_path_ + "/db"; + Options options; + options.create_if_missing = true; + EXPECT_OK(DB::Open(options, dbname_, &db_)); + } + + ~IOTracerParserTest() { + if (env_->FileExists(trace_file_path_).ok()) { + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + } + if (db_ != nullptr) { + Options options; + options.env = env_; + delete db_; + db_ = nullptr; + EXPECT_OK(DestroyDB(dbname_, options)); + } + EXPECT_OK(env_->DeleteDir(test_path_)); + } + + void GenerateIOTrace() { + WriteOptions write_opt; + TraceOptions trace_opt; + std::unique_ptr<TraceWriter> trace_writer; + + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + + ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer))); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i), + "value_" + std::to_string(i))); + ASSERT_OK(db_->Flush(FlushOptions())); + } + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + + void RunIOTracerParserTool() { + std::vector<std::string> params = {"./io_tracer_parser", + "-io_trace_file=" + 
trace_file_path_}; + + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); + + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast<int>(arg.size()) + 1; + } + ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv)); + } + + DB* db_; + Env* env_; + EnvOptions env_options_; + std::string trace_file_path_; + std::string output_file_; + std::string test_path_; + std::string dbname_; +}; + +TEST_F(IOTracerParserTest, InvalidArguments) { + { + std::vector<std::string> params = {"./io_tracer_parser"}; + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); + + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast<int>(arg.size()) + 1; + } + ASSERT_EQ(1, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv)); + } +} + +TEST_F(IOTracerParserTest, DumpAndParseIOTraceRecords) { + GenerateIOTrace(); + RunIOTracerParserTool(); +} + +TEST_F(IOTracerParserTest, NoRecordingAfterEndIOTrace) { + uint64_t file_size = 0; + // Generate IO trace records and parse them. + { + GenerateIOTrace(); + RunIOTracerParserTool(); + ASSERT_OK(env_->GetFileSize(trace_file_path_, &file_size)); + } + // Once DB::EndIOTrace is invoked in GenerateIOTrace(), no new records should + // be appended. 
+ { + WriteOptions write_opt; + for (int i = 10; i < 20; i++) { + ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i), + "value_" + std::to_string(i))); + ASSERT_OK(db_->Flush(FlushOptions())); + } + } + + uint64_t new_file_size = 0; + ASSERT_OK(env_->GetFileSize(trace_file_path_, &new_file_size)); + ASSERT_EQ(file_size, new_file_size); +} + +TEST_F(IOTracerParserTest, NoRecordingBeforeStartIOTrace) { + { + WriteOptions write_opt; + for (int i = 10; i < 20; i++) { + ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i), + "value_" + std::to_string(i))); + ASSERT_OK(db_->Flush(FlushOptions())); + } + // IO trace file doesn't exist + ASSERT_NOK(env_->FileExists(trace_file_path_)); + } + // Generate IO trace records and parse them. + { + GenerateIOTrace(); + RunIOTracerParserTool(); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // GFLAGS +#else +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "io_tracer_parser_test is not supported in ROCKSDB_LITE\n"); + return 0; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/io_tracer_parser_tool.cc b/src/rocksdb/tools/io_tracer_parser_tool.cc new file mode 100644 index 000000000..01b920f3b --- /dev/null +++ b/src/rocksdb/tools/io_tracer_parser_tool.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE +#ifdef GFLAGS +#include "tools/io_tracer_parser_tool.h" + +#include <cinttypes> +#include <cstdio> +#include <iomanip> +#include <memory> +#include <sstream> + +#include "port/lang.h" +#include "rocksdb/trace_reader_writer.h" +#include "trace_replay/io_tracer.h" +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(io_trace_file, "", "The IO trace file path."); + +namespace ROCKSDB_NAMESPACE { + +IOTraceRecordParser::IOTraceRecordParser(const std::string& input_file) + : input_file_(input_file) {} + +void IOTraceRecordParser::PrintHumanReadableHeader( + const IOTraceHeader& header) { + std::stringstream ss; + ss << "Start Time: " << header.start_time + << "\nRocksDB Major Version: " << header.rocksdb_major_version + << "\nRocksDB Minor Version: " << header.rocksdb_minor_version << "\n"; + fprintf(stdout, "%s", ss.str().c_str()); +} + +void IOTraceRecordParser::PrintHumanReadableIOTraceRecord( + const IOTraceRecord& record) { + std::stringstream ss; + ss << "Access Time : " << std::setw(20) << std::left + << record.access_timestamp << ", File Name: " << std::setw(20) << std::left + << record.file_name.c_str() << ", File Operation: " << std::setw(18) + << std::left << record.file_operation.c_str() + << ", Latency: " << std::setw(10) << std::left << record.latency + << ", IO Status: " << record.io_status.c_str(); + + // Each bit in io_op_data stores which corresponding info from IOTraceOp will + // be added in the trace. Foreg, if bit at position 1 is set then + // IOTraceOp::kIOLen (length) will be logged in the record (Since + // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in + // io_op_data one by one and, update corresponsing info in the trace record, + // unset that bit to find other set bits until io_op_data = 0. 
+ /* Read remaining options based on io_op_data set by file operation */ + int64_t io_op_data = static_cast<int64_t>(record.io_op_data); + while (io_op_data) { + // Find the rightmost set bit. + uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data)); + switch (set_pos) { + case IOTraceOp::kIOFileSize: + ss << ", File Size: " << record.file_size; + break; + case IOTraceOp::kIOLen: + ss << ", Length: " << record.len; + break; + case IOTraceOp::kIOOffset: + ss << ", Offset: " << record.offset; + break; + default: + assert(false); + } + // unset the rightmost bit. + io_op_data &= (io_op_data - 1); + } + + int64_t trace_data = static_cast<int64_t>(record.trace_data); + while (trace_data) { + // Find the rightmost set bit. + uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data)); + switch (set_pos) { + case IODebugContext::TraceData::kRequestID: + ss << ", Request Id: " << record.request_id; + break; + default: + assert(false); + } + // unset the rightmost bit. + trace_data &= (trace_data - 1); + } + + ss << "\n"; + fprintf(stdout, "%s", ss.str().c_str()); +} + +int IOTraceRecordParser::ReadIOTraceRecords() { + Status status; + Env* env(Env::Default()); + std::unique_ptr<TraceReader> trace_reader; + std::unique_ptr<IOTraceReader> io_trace_reader; + + status = NewFileTraceReader(env, EnvOptions(), input_file_, &trace_reader); + if (!status.ok()) { + fprintf(stderr, "%s: %s\n", input_file_.c_str(), status.ToString().c_str()); + return 1; + } + io_trace_reader.reset(new IOTraceReader(std::move(trace_reader))); + + // Read the header and dump it in a file. + IOTraceHeader header; + status = io_trace_reader->ReadHeader(&header); + if (!status.ok()) { + fprintf(stderr, "%s: %s\n", input_file_.c_str(), status.ToString().c_str()); + return 1; + } + PrintHumanReadableHeader(header); + + // Read the records one by one and print them in human readable format. 
+ while (status.ok()) { + IOTraceRecord record; + status = io_trace_reader->ReadIOOp(&record); + if (!status.ok()) { + break; + } + PrintHumanReadableIOTraceRecord(record); + } + return 0; +} + +int io_tracer_parser(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_io_trace_file.empty()) { + fprintf(stderr, "IO Trace file path is empty\n"); + return 1; + } + + IOTraceRecordParser io_tracer_parser(FLAGS_io_trace_file); + return io_tracer_parser.ReadIOTraceRecords(); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/io_tracer_parser_tool.h b/src/rocksdb/tools/io_tracer_parser_tool.h new file mode 100644 index 000000000..6c22c8f89 --- /dev/null +++ b/src/rocksdb/tools/io_tracer_parser_tool.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#pragma once + +#include <memory> + +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +struct IOTraceHeader; +struct IOTraceRecord; + +// IOTraceRecordParser class reads the IO trace file (in binary format) and +// dumps the human readable records in output_file_. +class IOTraceRecordParser { + public: + explicit IOTraceRecordParser(const std::string& input_file); + + // ReadIOTraceRecords reads the binary trace file records one by one and + // invoke PrintHumanReadableIOTraceRecord to dump the records in output_file_. + int ReadIOTraceRecords(); + + private: + void PrintHumanReadableHeader(const IOTraceHeader& header); + void PrintHumanReadableIOTraceRecord(const IOTraceRecord& record); + + // Binary file that contains IO trace records. 
+ std::string input_file_; +}; + +int io_tracer_parser(int argc, char** argv); + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/ldb.cc b/src/rocksdb/tools/ldb.cc new file mode 100644 index 000000000..482383be8 --- /dev/null +++ b/src/rocksdb/tools/ldb.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE + +#include "rocksdb/ldb_tool.h" + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::LDBTool tool; + tool.Run(argc, argv); + return 0; +} +#else +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/ldb_cmd.cc b/src/rocksdb/tools/ldb_cmd.cc new file mode 100644 index 000000000..ecd2d2977 --- /dev/null +++ b/src/rocksdb/tools/ldb_cmd.cc @@ -0,0 +1,4263 @@ + +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/ldb_cmd.h" + +#include <cinttypes> +#include <cstdlib> +#include <ctime> +#include <fstream> +#include <functional> +#include <iostream> +#include <limits> +#include <sstream> +#include <stdexcept> +#include <string> + +#include "db/blob/blob_index.h" +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/log_reader.h" +#include "db/version_util.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "rocksdb/cache.h" +#include "rocksdb/experimental.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/backup_engine.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/debug.h" +#include "rocksdb/utilities/options_util.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_dumper.h" +#include "tools/ldb_cmd_impl.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/file_checksum_helper.h" +#include "util/stderr_logger.h" +#include "util/string_util.h" +#include "utilities/blob_db/blob_dump_tool.h" +#include "utilities/merge_operators.h" +#include "utilities/ttl/db_ttl_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class FileChecksumGenCrc32c; +class FileChecksumGenCrc32cFactory; + +const std::string LDBCommand::ARG_ENV_URI = "env_uri"; +const std::string LDBCommand::ARG_FS_URI = "fs_uri"; +const std::string LDBCommand::ARG_DB = "db"; +const std::string LDBCommand::ARG_PATH = "path"; +const std::string LDBCommand::ARG_SECONDARY_PATH = "secondary_path"; +const std::string LDBCommand::ARG_HEX = "hex"; +const std::string LDBCommand::ARG_KEY_HEX = "key_hex"; +const std::string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const std::string LDBCommand::ARG_CF_NAME = "column_family"; +const std::string LDBCommand::ARG_TTL = "ttl"; 
+const std::string LDBCommand::ARG_TTL_START = "start_time"; +const std::string LDBCommand::ARG_TTL_END = "end_time"; +const std::string LDBCommand::ARG_TIMESTAMP = "timestamp"; +const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options"; +const std::string LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS = + "disable_consistency_checks"; +const std::string LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS = + "ignore_unknown_options"; +const std::string LDBCommand::ARG_FROM = "from"; +const std::string LDBCommand::ARG_TO = "to"; +const std::string LDBCommand::ARG_MAX_KEYS = "max_keys"; +const std::string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const std::string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; +const std::string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; +const std::string LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES = + "compression_max_dict_bytes"; +const std::string LDBCommand::ARG_BLOCK_SIZE = "block_size"; +const std::string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const std::string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size"; +const std::string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; +const std::string LDBCommand::ARG_FILE_SIZE = "file_size"; +const std::string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; +const std::string LDBCommand::ARG_NO_VALUE = "no_value"; +const std::string LDBCommand::ARG_ENABLE_BLOB_FILES = "enable_blob_files"; +const std::string LDBCommand::ARG_MIN_BLOB_SIZE = "min_blob_size"; +const std::string LDBCommand::ARG_BLOB_FILE_SIZE = "blob_file_size"; +const std::string LDBCommand::ARG_BLOB_COMPRESSION_TYPE = + "blob_compression_type"; +const std::string LDBCommand::ARG_ENABLE_BLOB_GARBAGE_COLLECTION = + "enable_blob_garbage_collection"; +const std::string LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF = + "blob_garbage_collection_age_cutoff"; +const std::string LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD = + 
"blob_garbage_collection_force_threshold"; +const std::string LDBCommand::ARG_BLOB_COMPACTION_READAHEAD_SIZE = + "blob_compaction_readahead_size"; +const std::string LDBCommand::ARG_BLOB_FILE_STARTING_LEVEL = + "blob_file_starting_level"; +const std::string LDBCommand::ARG_PREPOPULATE_BLOB_CACHE = + "prepopulate_blob_cache"; +const std::string LDBCommand::ARG_DECODE_BLOB_INDEX = "decode_blob_index"; +const std::string LDBCommand::ARG_DUMP_UNCOMPRESSED_BLOBS = + "dump_uncompressed_blobs"; + +const char* LDBCommand::DELIM = " ==> "; + +namespace { + +void DumpWalFile(Options options, std::string wal_file, bool print_header, + bool print_values, bool is_write_committed, + LDBCommandExecuteResult* exec_state); + +void DumpSstFile(Options options, std::string filename, bool output_hex, + bool show_properties, bool decode_blob_index, + std::string from_key = "", std::string to_key = ""); + +void DumpBlobFile(const std::string& filename, bool is_key_hex, + bool is_value_hex, bool dump_uncompressed_blobs); +}; // namespace + +LDBCommand* LDBCommand::InitFromCmdLineArgs( + int argc, char const* const* argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families) { + std::vector<std::string> args; + for (int i = 1; i < argc; i++) { + args.push_back(argv[i]); + } + return InitFromCmdLineArgs(args, options, ldb_options, column_families, + SelectCommand); +} + +/** + * Parse the command-line arguments and create the appropriate LDBCommand2 + * instance. + * The command line arguments must be in the following format: + * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] .. + * COMMAND <PARAM1> <PARAM2> ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] .. + * This is similar to the command line format used by HBaseClientTool. + * Command name is not included in args. + * Returns nullptr if the command-line cannot be parsed. 
 */
LDBCommand* LDBCommand::InitFromCmdLineArgs(
    const std::vector<std::string>& args, const Options& options,
    const LDBOptions& ldb_options,
    const std::vector<ColumnFamilyDescriptor>* /*column_families*/,
    const std::function<LDBCommand*(const ParsedParams&)>& selector) {
  // --x=y command line arguments are added as x->y map entries in
  // parsed_params.option_map.
  //
  // Command-line arguments of the form --hex end up in this array as hex to
  // parsed_params.flags
  ParsedParams parsed_params;

  // Everything other than option_map and flags. Represents commands
  // and their parameters. For eg: put key1 value1 go into this vector.
  std::vector<std::string> cmdTokens;

  const std::string OPTION_PREFIX = "--";

  for (const auto& arg : args) {
    // Note: for a 1-char arg "-", arg[1] is the terminating '\0', which
    // std::string::operator[] permits, so this test is safe.
    if (arg[0] == '-' && arg[1] == '-') {
      std::vector<std::string> splits = StringSplit(arg, '=');
      // --option_name=option_value
      if (splits.size() == 2) {
        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
        parsed_params.option_map[optionKey] = splits[1];
      } else if (splits.size() == 1) {
        // --flag_name
        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
        parsed_params.flags.push_back(optionKey);
      } else {
        // --option_name=option_value, option_value contains '='
        // Keep everything after the first '=' as the value.
        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
        parsed_params.option_map[optionKey] =
            arg.substr(splits[0].length() + 1);
      }
    } else {
      // Not an option: a command name or a positional parameter.
      cmdTokens.push_back(arg);
    }
  }

  if (cmdTokens.size() < 1) {
    fprintf(stderr, "Command not specified!");
    return nullptr;
  }

  // First non-option token is the command; the rest are its parameters.
  parsed_params.cmd = cmdTokens[0];
  parsed_params.cmd_params.assign(cmdTokens.begin() + 1, cmdTokens.end());

  LDBCommand* command = selector(parsed_params);

  if (command) {
    command->SetDBOptions(options);
    command->SetLDBOptions(ldb_options);
  }
  return command;
}

// Maps a parsed command name to a freshly allocated LDBCommand subclass.
// Returns nullptr when the name matches no known command; ownership of the
// returned object passes to the caller.
LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
  if (parsed_params.cmd == GetCommand::Name()) {
    return new GetCommand(parsed_params.cmd_params, parsed_params.option_map,
                          parsed_params.flags);
  } else if (parsed_params.cmd == PutCommand::Name()) {
    return new PutCommand(parsed_params.cmd_params, parsed_params.option_map,
                          parsed_params.flags);
  } else if (parsed_params.cmd == BatchPutCommand::Name()) {
    return new BatchPutCommand(parsed_params.cmd_params,
                               parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == ScanCommand::Name()) {
    return new ScanCommand(parsed_params.cmd_params, parsed_params.option_map,
                           parsed_params.flags);
  } else if (parsed_params.cmd == DeleteCommand::Name()) {
    return new DeleteCommand(parsed_params.cmd_params, parsed_params.option_map,
                             parsed_params.flags);
  } else if (parsed_params.cmd == SingleDeleteCommand::Name()) {
    return new SingleDeleteCommand(parsed_params.cmd_params,
                                   parsed_params.option_map,
                                   parsed_params.flags);
  } else if (parsed_params.cmd == DeleteRangeCommand::Name()) {
    return new DeleteRangeCommand(parsed_params.cmd_params,
                                  parsed_params.option_map,
                                  parsed_params.flags);
  } else if (parsed_params.cmd == ApproxSizeCommand::Name()) {
    return new ApproxSizeCommand(parsed_params.cmd_params,
                                 parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == DBQuerierCommand::Name()) {
    return new DBQuerierCommand(parsed_params.cmd_params,
                                parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == CompactorCommand::Name()) {
    return new CompactorCommand(parsed_params.cmd_params,
                                parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == WALDumperCommand::Name()) {
    return new WALDumperCommand(parsed_params.cmd_params,
                                parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == ReduceDBLevelsCommand::Name()) {
    return new ReduceDBLevelsCommand(parsed_params.cmd_params,
                                     parsed_params.option_map,
                                     parsed_params.flags);
  } else if (parsed_params.cmd == ChangeCompactionStyleCommand::Name()) {
    return new ChangeCompactionStyleCommand(parsed_params.cmd_params,
                                            parsed_params.option_map,
                                            parsed_params.flags);
  } else if (parsed_params.cmd == DBDumperCommand::Name()) {
    return new DBDumperCommand(parsed_params.cmd_params,
                               parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == DBLoaderCommand::Name()) {
    return new DBLoaderCommand(parsed_params.cmd_params,
                               parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == ManifestDumpCommand::Name()) {
    return new ManifestDumpCommand(parsed_params.cmd_params,
                                   parsed_params.option_map,
                                   parsed_params.flags);
  } else if (parsed_params.cmd == FileChecksumDumpCommand::Name()) {
    return new FileChecksumDumpCommand(parsed_params.cmd_params,
                                       parsed_params.option_map,
                                       parsed_params.flags);
  } else if (parsed_params.cmd == GetPropertyCommand::Name()) {
    return new GetPropertyCommand(parsed_params.cmd_params,
                                  parsed_params.option_map,
                                  parsed_params.flags);
  } else if (parsed_params.cmd == ListColumnFamiliesCommand::Name()) {
    return new ListColumnFamiliesCommand(parsed_params.cmd_params,
                                         parsed_params.option_map,
                                         parsed_params.flags);
  } else if (parsed_params.cmd == CreateColumnFamilyCommand::Name()) {
    return new CreateColumnFamilyCommand(parsed_params.cmd_params,
                                         parsed_params.option_map,
                                         parsed_params.flags);
  } else if (parsed_params.cmd == DropColumnFamilyCommand::Name()) {
    return new DropColumnFamilyCommand(parsed_params.cmd_params,
                                       parsed_params.option_map,
                                       parsed_params.flags);
  } else if (parsed_params.cmd == DBFileDumperCommand::Name()) {
    return new DBFileDumperCommand(parsed_params.cmd_params,
                                   parsed_params.option_map,
                                   parsed_params.flags);
  } else if (parsed_params.cmd == DBLiveFilesMetadataDumperCommand::Name()) {
    return new DBLiveFilesMetadataDumperCommand(parsed_params.cmd_params,
                                                parsed_params.option_map,
                                                parsed_params.flags);
  } else if (parsed_params.cmd == InternalDumpCommand::Name()) {
    return new InternalDumpCommand(parsed_params.cmd_params,
                                   parsed_params.option_map,
                                   parsed_params.flags);
  } else if (parsed_params.cmd == CheckConsistencyCommand::Name()) {
    return new CheckConsistencyCommand(parsed_params.cmd_params,
                                       parsed_params.option_map,
                                       parsed_params.flags);
  } else if (parsed_params.cmd == CheckPointCommand::Name()) {
    return new CheckPointCommand(parsed_params.cmd_params,
                                 parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == RepairCommand::Name()) {
    return new RepairCommand(parsed_params.cmd_params, parsed_params.option_map,
                             parsed_params.flags);
  } else if (parsed_params.cmd == BackupCommand::Name()) {
    return new BackupCommand(parsed_params.cmd_params, parsed_params.option_map,
                             parsed_params.flags);
  } else if (parsed_params.cmd == RestoreCommand::Name()) {
    return new RestoreCommand(parsed_params.cmd_params,
                              parsed_params.option_map, parsed_params.flags);
  } else if (parsed_params.cmd == WriteExternalSstFilesCommand::Name()) {
    return new WriteExternalSstFilesCommand(parsed_params.cmd_params,
                                            parsed_params.option_map,
                                            parsed_params.flags);
  } else if (parsed_params.cmd == IngestExternalSstFilesCommand::Name()) {
    return new IngestExternalSstFilesCommand(parsed_params.cmd_params,
                                             parsed_params.option_map,
                                             parsed_params.flags);
  } else if (parsed_params.cmd == ListFileRangeDeletesCommand::Name()) {
    // NOTE(review): unlike the other commands this constructor takes no
    // cmd_params — positional parameters are silently ignored here.
    return new ListFileRangeDeletesCommand(parsed_params.option_map,
                                           parsed_params.flags);
  } else if (parsed_params.cmd == UnsafeRemoveSstFileCommand::Name()) {
    return new UnsafeRemoveSstFileCommand(parsed_params.cmd_params,
                                          parsed_params.option_map,
                                          parsed_params.flags);
  } else if (parsed_params.cmd == UpdateManifestCommand::Name()) {
    return new UpdateManifestCommand(parsed_params.cmd_params,
                                     parsed_params.option_map,
                                     parsed_params.flags);
  }
  return nullptr;
}

/* Run the command, and return the execute result.
 */
void LDBCommand::Run() {
  // Run() is idempotent: a command whose execution already started (or
  // finished/failed) is not re-executed.
  if (!exec_state_.IsNotStarted()) {
    return;
  }

  // Resolve the Env from --env_uri/--fs_uri unless the caller already
  // injected a custom one.
  if (!options_.env || options_.env == Env::Default()) {
    Env* env = Env::Default();
    Status s = Env::CreateFromUri(config_options_, env_uri_, fs_uri_, &env,
                                  &env_guard_);
    if (!s.ok()) {
      fprintf(stderr, "%s\n", s.ToString().c_str());
      exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
      return;
    }
    options_.env = env;
  }

  if (db_ == nullptr && !NoDBOpen()) {
    OpenDB();
    if (exec_state_.IsFailed() && try_load_options_) {
      // We don't always return if there is a failure because a WAL file or
      // manifest file can be given to "dump" command so we should continue.
      // --try_load_options is not valid in those cases.
      return;
    }
  }

  // We'll intentionally proceed even if the DB can't be opened because users
  // can also specify a filename, not just a directory.
  DoCommand();

  if (exec_state_.IsNotStarted()) {
    exec_state_ = LDBCommandExecuteResult::Succeed("");
  }

  if (db_ != nullptr) {
    CloseDB();
  }
}

// Base-class constructor: captures the parsed option map/flags and derives
// the common settings (paths, hex modes, TTL, blob flags) that every
// subcommand shares. The DB itself is not opened here.
LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
                       const std::vector<std::string>& flags, bool is_read_only,
                       const std::vector<std::string>& valid_cmd_line_options)
    : db_(nullptr),
      db_ttl_(nullptr),
      is_read_only_(is_read_only),
      is_key_hex_(false),
      is_value_hex_(false),
      is_db_ttl_(false),
      timestamp_(false),
      try_load_options_(false),
      create_if_missing_(false),
      option_map_(options),
      flags_(flags),
      valid_cmd_line_options_(valid_cmd_line_options) {
  auto itr = options.find(ARG_DB);
  if (itr != options.end()) {
    db_path_ = itr->second;
  }

  itr = options.find(ARG_ENV_URI);
  if (itr != options.end()) {
    env_uri_ = itr->second;
  }

  itr = options.find(ARG_FS_URI);
  if (itr != options.end()) {
    fs_uri_ = itr->second;
  }

  // Default to the "default" column family when --column_family is absent;
  // column_family_name_ is therefore never empty after construction.
  itr = options.find(ARG_CF_NAME);
  if (itr != options.end()) {
    column_family_name_ = itr->second;
  } else {
    column_family_name_ = kDefaultColumnFamilyName;
  }

  itr = options.find(ARG_SECONDARY_PATH);
  secondary_path_ = "";
  if (itr != options.end()) {
    secondary_path_ = itr->second;
  }

  is_key_hex_ = IsKeyHex(options, flags);
  is_value_hex_ = IsValueHex(options, flags);
  is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
  timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
  try_load_options_ = IsTryLoadOptions(options, flags);
  force_consistency_checks_ =
      !IsFlagPresent(flags, ARG_DISABLE_CONSISTENCY_CHECKS);
  enable_blob_files_ = IsFlagPresent(flags, ARG_ENABLE_BLOB_FILES);
  enable_blob_garbage_collection_ =
      IsFlagPresent(flags, ARG_ENABLE_BLOB_GARBAGE_COLLECTION);
  config_options_.ignore_unknown_options =
      IsFlagPresent(flags, ARG_IGNORE_UNKNOWN_OPTIONS);
}

// Opens the database according to the settings derived in the constructor
// (TTL / read-only / secondary / multi-CF). On any failure exec_state_ is set
// to Failed; on multi-CF success cf_handles_ is populated.
void LDBCommand::OpenDB() {
  PrepareOptions();
  if (!exec_state_.IsNotStarted()) {
    return;
  }
  if (column_families_.empty() && !options_.merge_operator) {
    // No harm to add a general merge operator if it is not specified.
    options_.merge_operator = MergeOperators::CreateStringAppendOperator(':');
  }
  // Open the DB.
  Status st;
  std::vector<ColumnFamilyHandle*> handles_opened;
  if (is_db_ttl_) {
    // ldb doesn't yet support TTL DB with multiple column families
    // NOTE(review): column_family_name_ is initialized to
    // kDefaultColumnFamilyName above and so is never empty — this condition
    // appears to be always true for TTL DBs; verify the intended check.
    if (!column_family_name_.empty() || !column_families_.empty()) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "ldb doesn't support TTL DB with multiple column families");
    }
    if (!secondary_path_.empty()) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Open as secondary is not supported for TTL DB yet.");
    }
    if (is_read_only_) {
      st = DBWithTTL::Open(options_, db_path_, &db_ttl_, 0, true);
    } else {
      st = DBWithTTL::Open(options_, db_path_, &db_ttl_);
    }
    db_ = db_ttl_;
  } else {
    if (is_read_only_ && secondary_path_.empty()) {
      if (column_families_.empty()) {
        st = DB::OpenForReadOnly(options_, db_path_, &db_);
      } else {
        st = DB::OpenForReadOnly(options_, db_path_, column_families_,
                                 &handles_opened, &db_);
      }
    } else {
      if (column_families_.empty()) {
        if (secondary_path_.empty()) {
          st = DB::Open(options_, db_path_, &db_);
        } else {
          st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_);
        }
      } else {
        if (secondary_path_.empty()) {
          st = DB::Open(options_, db_path_, column_families_, &handles_opened,
                        &db_);
        } else {
          st = DB::OpenAsSecondary(options_, db_path_, secondary_path_,
                                   column_families_, &handles_opened, &db_);
        }
      }
    }
  }
  if (!st.ok()) {
    std::string msg = st.ToString();
    exec_state_ = LDBCommandExecuteResult::Failed(msg);
  } else if (!handles_opened.empty()) {
    assert(handles_opened.size() == column_families_.size());
    bool found_cf_name = false;
    for (size_t i = 0; i < handles_opened.size(); i++) {
      cf_handles_[column_families_[i].name] = handles_opened[i];
      if (column_family_name_ == column_families_[i].name) {
        found_cf_name = true;
      }
    }
    if (!found_cf_name) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Non-existing column family " + column_family_name_);
      CloseDB();
    }
  } else {
    // We successfully opened DB in single column family mode.
    assert(column_families_.empty());
    if (column_family_name_ != kDefaultColumnFamilyName) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Non-existing column family " + column_family_name_);
      CloseDB();
    }
  }
}

// Releases all column family handles, closes and deletes the DB. Safe to
// call when the DB was never opened.
void LDBCommand::CloseDB() {
  if (db_ != nullptr) {
    for (auto& pair : cf_handles_) {
      delete pair.second;
    }
    Status s = db_->Close();
    s.PermitUncheckedError();
    delete db_;
    db_ = nullptr;
  }
}

// Returns the handle for the selected column family, falling back to the
// DB's default CF when the DB was opened in single-CF mode. Sets exec_state_
// to Failed (and still returns the default CF) if the requested name is
// missing from a multi-CF open.
ColumnFamilyHandle* LDBCommand::GetCfHandle() {
  if (!cf_handles_.empty()) {
    auto it = cf_handles_.find(column_family_name_);
    if (it == cf_handles_.end()) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Cannot find column family " + column_family_name_);
    } else {
      return it->second;
    }
  }
  return db_->DefaultColumnFamily();
}

// Returns the options common to every subcommand plus the subcommand-specific
// ones passed in `options`, for use by ValidateCmdLineOptions().
std::vector<std::string> LDBCommand::BuildCmdLineOptions(
    std::vector<std::string> options) {
  std::vector<std::string> ret = {ARG_ENV_URI,
                                  ARG_FS_URI,
                                  ARG_DB,
                                  ARG_SECONDARY_PATH,
                                  ARG_BLOOM_BITS,
                                  ARG_BLOCK_SIZE,
                                  ARG_AUTO_COMPACTION,
                                  ARG_COMPRESSION_TYPE,
                                  ARG_COMPRESSION_MAX_DICT_BYTES,
                                  ARG_WRITE_BUFFER_SIZE,
                                  ARG_FILE_SIZE,
                                  ARG_FIX_PREFIX_LEN,
                                  ARG_TRY_LOAD_OPTIONS,
                                  ARG_DISABLE_CONSISTENCY_CHECKS,
                                  ARG_ENABLE_BLOB_FILES,
                                  ARG_MIN_BLOB_SIZE,
                                  ARG_BLOB_FILE_SIZE,
                                  ARG_BLOB_COMPRESSION_TYPE,
                                  ARG_ENABLE_BLOB_GARBAGE_COLLECTION,
                                  ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF,
                                  ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD,
                                  ARG_BLOB_COMPACTION_READAHEAD_SIZE,
                                  ARG_BLOB_FILE_STARTING_LEVEL,
                                  ARG_PREPOPULATE_BLOB_CACHE,
                                  ARG_IGNORE_UNKNOWN_OPTIONS,
                                  ARG_CF_NAME};
  ret.insert(ret.end(), options.begin(), options.end());
  return ret;
}

/**
 * Parses the specific double option and fills in the value.
 * Returns true if the option is found.
 * Returns false if the option is not found or if there is an error parsing the
 * value. If there is an error, the specified exec_state is also
 * updated.
+ */ +bool LDBCommand::ParseDoubleOption( + const std::map<std::string, std::string>& /*options*/, + const std::string& option, double& value, + LDBCommandExecuteResult& exec_state) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { +#if defined(CYGWIN) + char* str_end = nullptr; + value = std::strtod(itr->second.c_str(), &str_end); + if (str_end == itr->second.c_str()) { + exec_state = + LDBCommandExecuteResult::Failed(option + " has an invalid value."); + } else if (errno == ERANGE) { + exec_state = LDBCommandExecuteResult::Failed( + option + " has a value out-of-range."); + } else { + return true; + } +#else + try { + value = std::stod(itr->second); + return true; + } catch (const std::invalid_argument&) { + exec_state = + LDBCommandExecuteResult::Failed(option + " has an invalid value."); + } catch (const std::out_of_range&) { + exec_state = LDBCommandExecuteResult::Failed( + option + " has a value out-of-range."); + } +#endif + } + return false; +} + +/** + * Parses the specific integer option and fills in the value. + * Returns true if the option is found. + * Returns false if the option is not found or if there is an error parsing the + * value. If there is an error, the specified exec_state is also + * updated. 
+ */ +bool LDBCommand::ParseIntOption( + const std::map<std::string, std::string>& /*options*/, + const std::string& option, int& value, + LDBCommandExecuteResult& exec_state) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { +#if defined(CYGWIN) + char* str_end = nullptr; + value = strtol(itr->second.c_str(), &str_end, 10); + if (str_end == itr->second.c_str()) { + exec_state = + LDBCommandExecuteResult::Failed(option + " has an invalid value."); + } else if (errno == ERANGE) { + exec_state = LDBCommandExecuteResult::Failed( + option + " has a value out-of-range."); + } else { + return true; + } +#else + try { + value = std::stoi(itr->second); + return true; + } catch (const std::invalid_argument&) { + exec_state = + LDBCommandExecuteResult::Failed(option + " has an invalid value."); + } catch (const std::out_of_range&) { + exec_state = LDBCommandExecuteResult::Failed( + option + " has a value out-of-range."); + } +#endif + } + return false; +} + +/** + * Parses the specified option and fills in the value. + * Returns true if the option is found. + * Returns false otherwise. + */ +bool LDBCommand::ParseStringOption( + const std::map<std::string, std::string>& /*options*/, + const std::string& option, std::string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + +/** + * Parses the specified compression type and fills in the value. + * Returns true if the compression type is found. + * Returns false otherwise. 
 */
bool LDBCommand::ParseCompressionTypeOption(
    const std::map<std::string, std::string>& /*options*/,
    const std::string& option, CompressionType& value,
    LDBCommandExecuteResult& exec_state) {
  auto itr = option_map_.find(option);
  if (itr != option_map_.end()) {
    const std::string& comp = itr->second;
    if (comp == "no") {
      value = kNoCompression;
      return true;
    } else if (comp == "snappy") {
      value = kSnappyCompression;
      return true;
    } else if (comp == "zlib") {
      value = kZlibCompression;
      return true;
    } else if (comp == "bzip2") {
      value = kBZip2Compression;
      return true;
    } else if (comp == "lz4") {
      value = kLZ4Compression;
      return true;
    } else if (comp == "lz4hc") {
      value = kLZ4HCCompression;
      return true;
    } else if (comp == "xpress") {
      value = kXpressCompression;
      return true;
    } else if (comp == "zstd") {
      value = kZSTD;
      return true;
    } else {
      // Unknown compression.
      exec_state = LDBCommandExecuteResult::Failed(
          "Unknown compression algorithm: " + comp);
    }
  }
  return false;
}

// Applies DB-wide CLI overrides to options_ (never create a missing DB from a
// generic command; --db_write_buffer_size; default db_paths) and then the
// column-family-level overrides. Parse failures are recorded in exec_state_.
void LDBCommand::OverrideBaseOptions() {
  options_.create_if_missing = false;

  int db_write_buffer_size;
  if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE,
                     db_write_buffer_size, exec_state_)) {
    if (db_write_buffer_size >= 0) {
      options_.db_write_buffer_size = db_write_buffer_size;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(ARG_DB_WRITE_BUFFER_SIZE +
                                                    " must be >= 0.");
    }
  }

  if (options_.db_paths.size() == 0) {
    options_.db_paths.emplace_back(db_path_,
                                   std::numeric_limits<uint64_t>::max());
  }

  OverrideBaseCFOptions(static_cast<ColumnFamilyOptions*>(&options_));
}

// Applies every column-family-level CLI override (bloom filter, block size,
// blob settings, compression, buffer/file sizes, prefix extractor) to
// *cf_opts. Each option is validated independently; the first invalid one
// sets exec_state_ to Failed but later options are still processed.
void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) {
  BlockBasedTableOptions table_options;
  // A custom table factory is only installed if at least one block-based
  // table option was given on the command line.
  bool use_table_options = false;
  int bits;
  if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
    if (bits > 0) {
      use_table_options = true;
      table_options.filter_policy.reset(NewBloomFilterPolicy(bits));
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_BLOOM_BITS + " must be > 0.");
    }
  }

  int block_size;
  if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
    if (block_size > 0) {
      use_table_options = true;
      table_options.block_size = block_size;
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_BLOCK_SIZE + " must be > 0.");
    }
  }

  cf_opts->force_consistency_checks = force_consistency_checks_;
  if (use_table_options) {
    cf_opts->table_factory.reset(NewBlockBasedTableFactory(table_options));
  }

  cf_opts->enable_blob_files = enable_blob_files_;

  int min_blob_size;
  if (ParseIntOption(option_map_, ARG_MIN_BLOB_SIZE, min_blob_size,
                     exec_state_)) {
    if (min_blob_size >= 0) {
      cf_opts->min_blob_size = min_blob_size;
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_MIN_BLOB_SIZE + " must be >= 0.");
    }
  }

  int blob_file_size;
  if (ParseIntOption(option_map_, ARG_BLOB_FILE_SIZE, blob_file_size,
                     exec_state_)) {
    if (blob_file_size > 0) {
      cf_opts->blob_file_size = blob_file_size;
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_BLOB_FILE_SIZE + " must be > 0.");
    }
  }

  cf_opts->enable_blob_garbage_collection = enable_blob_garbage_collection_;

  double blob_garbage_collection_age_cutoff;
  if (ParseDoubleOption(option_map_, ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF,
                        blob_garbage_collection_age_cutoff, exec_state_)) {
    if (blob_garbage_collection_age_cutoff >= 0 &&
        blob_garbage_collection_age_cutoff <= 1) {
      cf_opts->blob_garbage_collection_age_cutoff =
          blob_garbage_collection_age_cutoff;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF + " must be >= 0 and <= 1.");
    }
  }

  double blob_garbage_collection_force_threshold;
  if (ParseDoubleOption(option_map_,
                        ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD,
                        blob_garbage_collection_force_threshold, exec_state_)) {
    if (blob_garbage_collection_force_threshold >= 0 &&
        blob_garbage_collection_force_threshold <= 1) {
      cf_opts->blob_garbage_collection_force_threshold =
          blob_garbage_collection_force_threshold;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD +
          " must be >= 0 and <= 1.");
    }
  }

  int blob_compaction_readahead_size;
  if (ParseIntOption(option_map_, ARG_BLOB_COMPACTION_READAHEAD_SIZE,
                     blob_compaction_readahead_size, exec_state_)) {
    if (blob_compaction_readahead_size > 0) {
      cf_opts->blob_compaction_readahead_size = blob_compaction_readahead_size;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_BLOB_COMPACTION_READAHEAD_SIZE + " must be > 0.");
    }
  }

  int blob_file_starting_level;
  if (ParseIntOption(option_map_, ARG_BLOB_FILE_STARTING_LEVEL,
                     blob_file_starting_level, exec_state_)) {
    if (blob_file_starting_level >= 0) {
      cf_opts->blob_file_starting_level = blob_file_starting_level;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_BLOB_FILE_STARTING_LEVEL + " must be >= 0.");
    }
  }

  int prepopulate_blob_cache;
  if (ParseIntOption(option_map_, ARG_PREPOPULATE_BLOB_CACHE,
                     prepopulate_blob_cache, exec_state_)) {
    switch (prepopulate_blob_cache) {
      case 0:
        cf_opts->prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
        break;
      case 1:
        cf_opts->prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
        break;
      default:
        exec_state_ = LDBCommandExecuteResult::Failed(
            ARG_PREPOPULATE_BLOB_CACHE +
            " must be 0 (disable) or 1 (flush only).");
    }
  }

  auto itr = option_map_.find(ARG_AUTO_COMPACTION);
  if (itr != option_map_.end()) {
    cf_opts->disable_auto_compactions = !StringToBool(itr->second);
  }

  CompressionType compression_type;
  if (ParseCompressionTypeOption(option_map_, ARG_COMPRESSION_TYPE,
                                 compression_type, exec_state_)) {
    cf_opts->compression = compression_type;
  }

  CompressionType blob_compression_type;
  if (ParseCompressionTypeOption(option_map_, ARG_BLOB_COMPRESSION_TYPE,
                                 blob_compression_type, exec_state_)) {
    cf_opts->blob_compression_type = blob_compression_type;
  }

  int compression_max_dict_bytes;
  if (ParseIntOption(option_map_, ARG_COMPRESSION_MAX_DICT_BYTES,
                     compression_max_dict_bytes, exec_state_)) {
    if (compression_max_dict_bytes >= 0) {
      cf_opts->compression_opts.max_dict_bytes = compression_max_dict_bytes;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_COMPRESSION_MAX_DICT_BYTES + " must be >= 0.");
    }
  }

  int write_buffer_size;
  if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size,
                     exec_state_)) {
    if (write_buffer_size > 0) {
      cf_opts->write_buffer_size = write_buffer_size;
    } else {
      exec_state_ = LDBCommandExecuteResult::Failed(ARG_WRITE_BUFFER_SIZE +
                                                    " must be > 0.");
    }
  }

  int file_size;
  if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) {
    if (file_size > 0) {
      cf_opts->target_file_size_base = file_size;
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_FILE_SIZE + " must be > 0.");
    }
  }

  int fix_prefix_len;
  if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len,
                     exec_state_)) {
    if (fix_prefix_len > 0) {
      cf_opts->prefix_extractor.reset(
          NewFixedPrefixTransform(static_cast<size_t>(fix_prefix_len)));
    } else {
      exec_state_ =
          LDBCommandExecuteResult::Failed(ARG_FIX_PREFIX_LEN + " must be > 0.");
    }
  }
}

// First, initializes the options state using the OPTIONS file when enabled.
// Second, overrides the options according to the CLI arguments and the
// specific subcommand being run.
void LDBCommand::PrepareOptions() {
  // Load the persisted OPTIONS file only when opening an existing DB with
  // --try_load_options; a DB being created has no OPTIONS file yet.
  if (!create_if_missing_ && try_load_options_) {
    config_options_.env = options_.env;
    Status s = LoadLatestOptions(config_options_, db_path_, &options_,
                                 &column_families_);
    if (!s.ok() && !s.IsNotFound()) {
      // Option file exists but load option file error.
      std::string msg = s.ToString();
      exec_state_ = LDBCommandExecuteResult::Failed(msg);
      db_ = nullptr;
      return;
    }
    if (!options_.wal_dir.empty()) {
      if (options_.env->FileExists(options_.wal_dir).IsNotFound()) {
        options_.wal_dir = db_path_;
        fprintf(
            stderr,
            "wal_dir loaded from the option file doesn't exist. Ignore it.\n");
      }
    }

    // If merge operator is not set, set a string append operator.
    for (auto& cf_entry : column_families_) {
      if (!cf_entry.options.merge_operator) {
        cf_entry.options.merge_operator =
            MergeOperators::CreateStringAppendOperator(':');
      }
    }
  }

  if (options_.env == Env::Default()) {
    options_.env = config_options_.env;
  }

  OverrideBaseOptions();
  if (exec_state_.IsFailed()) {
    return;
  }

  if (column_families_.empty()) {
    // Reads the MANIFEST to figure out what column families exist. In this
    // case, the option overrides from the CLI argument/specific subcommand
    // apply to all column families.
    std::vector<std::string> cf_list;
    Status st = DB::ListColumnFamilies(options_, db_path_, &cf_list);
    // It is possible the DB doesn't exist yet, for "create if not
    // existing" case. The failure is ignored here. We rely on DB::Open()
    // to give us the correct error message for problem with opening
    // existing DB.
    if (st.ok() && cf_list.size() > 1) {
      // Ignore single column family DB.
      for (auto cf_name : cf_list) {
        column_families_.emplace_back(cf_name, options_);
      }
    }
  } else {
    // We got column families from the OPTIONS file. In this case, the option
    // overrides from the CLI argument/specific subcommand only apply to the
    // column family specified by `--column_family_name`.
    auto column_families_iter =
        std::find_if(column_families_.begin(), column_families_.end(),
                     [this](const ColumnFamilyDescriptor& cf_desc) {
                       return cf_desc.name == column_family_name_;
                     });
    if (column_families_iter == column_families_.end()) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Non-existing column family " + column_family_name_);
      return;
    }
    OverrideBaseCFOptions(&column_families_iter->options);
  }
}

// Splits a "key ==> value" dump line at DELIM into *key and *value,
// hex-decoding each part when requested. Returns false (outputs untouched)
// when the delimiter is absent.
bool LDBCommand::ParseKeyValue(const std::string& line, std::string* key,
                               std::string* value, bool is_key_hex,
                               bool is_value_hex) {
  size_t pos = line.find(DELIM);
  if (pos != std::string::npos) {
    *key = line.substr(0, pos);
    *value = line.substr(pos + strlen(DELIM));
    if (is_key_hex) {
      *key = HexToString(*key);
    }
    if (is_value_hex) {
      *value = HexToString(*value);
    }
    return true;
  } else {
    return false;
  }
}

/**
 * Make sure that ONLY the command-line options and flags expected by this
 * command are specified on the command-line. Extraneous options are usually
 * the result of user error.
 * Returns true if all checks pass. Else returns false, and prints an
 * appropriate error msg to stderr.
+ */ +bool LDBCommand::ValidateCmdLineOptions() { + for (auto itr = option_map_.begin(); itr != option_map_.end(); ++itr) { + if (std::find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), + itr->first) == valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); + return false; + } + } + + for (std::vector<std::string>::const_iterator itr = flags_.begin(); + itr != flags_.end(); ++itr) { + if (std::find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), + *itr) == valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); + return false; + } + } + + if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end() && + option_map_.find(ARG_PATH) == option_map_.end()) { + fprintf(stderr, "Either %s or %s must be specified.\n", ARG_DB.c_str(), + ARG_PATH.c_str()); + return false; + } + + return true; +} + +std::string LDBCommand::HexToString(const std::string& str) { + std::string result; + std::string::size_type len = str.length(); + if (len < 2 || str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str()); + throw "Invalid hex input"; + } + if (!Slice(str.data() + 2, len - 2).DecodeHex(&result)) { + throw "Invalid hex input"; + } + return result; +} + +std::string LDBCommand::StringToHex(const std::string& str) { + std::string result("0x"); + result.append(Slice(str).ToString(true)); + return result; +} + +std::string LDBCommand::PrintKeyValue(const std::string& key, + const std::string& value, bool is_key_hex, + bool is_value_hex) { + std::string result; + result.append(is_key_hex ? StringToHex(key) : key); + result.append(DELIM); + result.append(is_value_hex ? 
StringToHex(value) : value); + return result; +} + +std::string LDBCommand::PrintKeyValue(const std::string& key, + const std::string& value, bool is_hex) { + return PrintKeyValue(key, value, is_hex, is_hex); +} + +std::string LDBCommand::HelpRangeCmdArgs() { + std::ostringstream str_stream; + str_stream << " "; + str_stream << "[--" << ARG_FROM << "] "; + str_stream << "[--" << ARG_TO << "] "; + return str_stream.str(); +} + +bool LDBCommand::IsKeyHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) { + return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_KEY_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_KEY_HEX, false)); +} + +bool LDBCommand::IsValueHex(const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_VALUE_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_VALUE_HEX, false)); +} + +bool LDBCommand::IsTryLoadOptions( + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) { + if (IsFlagPresent(flags, ARG_TRY_LOAD_OPTIONS)) { + return true; + } + // if `DB` is specified and not explicitly to create a new db, default + // `try_load_options` to true. The user could still disable that by set + // `try_load_options=false`. + // Note: Opening as TTL DB doesn't support `try_load_options`, so it's default + // to false. TODO: TTL_DB may need to fix that, otherwise it's unable to open + // DB which has incompatible setting with default options. 
+ bool default_val = (options.find(ARG_DB) != options.end()) && + !IsFlagPresent(flags, ARG_CREATE_IF_MISSING) && + !IsFlagPresent(flags, ARG_TTL); + return ParseBooleanOption(options, ARG_TRY_LOAD_OPTIONS, default_val); +} + +bool LDBCommand::ParseBooleanOption( + const std::map<std::string, std::string>& options, + const std::string& option, bool default_val) { + auto itr = options.find(option); + if (itr != options.end()) { + std::string option_val = itr->second; + return StringToBool(itr->second); + } + return default_val; +} + +bool LDBCommand::StringToBool(std::string val) { + std::transform(val.begin(), val.end(), val.begin(), + [](char ch) -> char { return (char)::tolower(ch); }); + + if (val == "true") { + return true; + } else if (val == "false") { + return false; + } else { + throw "Invalid value for boolean argument"; + } +} + +CompactorCommand::CompactorCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_TTL})), + null_from_(true), + null_to_(true) { + auto itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void CompactorCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(CompactorCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void CompactorCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + + Slice* begin = nullptr; + Slice* end = nullptr; + if (!null_from_) { + begin = new Slice(from_); + } + if (!null_to_) { + end = new Slice(to_); + } + + 
  // (continuation of CompactorCommand::DoCommand from the previous chunk)
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;

  Status s = db_->CompactRange(cro, GetCfHandle(), begin, end);
  if (!s.ok()) {
    std::stringstream oss;
    oss << "Compaction failed: " << s.ToString();
    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
  } else {
    exec_state_ = LDBCommandExecuteResult::Succeed("");
  }

  delete begin;
  delete end;
}

// ---------------------------------------------------------------------------
const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
const std::string DBLoaderCommand::ARG_COMPACT = "compact";

// Loads "key ==> value" lines from stdin into the DB.
DBLoaderCommand::DBLoaderCommand(
    const std::vector<std::string>& /*params*/,
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(
          options, flags, false,
          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
                               ARG_TO, ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
                               ARG_BULK_LOAD, ARG_COMPACT})),
      disable_wal_(false),
      bulk_load_(false),
      compact_(false) {
  create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
  disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL);
  bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD);
  compact_ = IsFlagPresent(flags, ARG_COMPACT);
}

void DBLoaderCommand::Help(std::string& ret) {
  ret.append("  ");
  ret.append(DBLoaderCommand::Name());
  ret.append(" [--" + ARG_CREATE_IF_MISSING + "]");
  ret.append(" [--" + ARG_DISABLE_WAL + "]");
  ret.append(" [--" + ARG_BULK_LOAD + "]");
  ret.append(" [--" + ARG_COMPACT + "]");
  ret.append("\n");
}

void DBLoaderCommand::OverrideBaseOptions() {
  LDBCommand::OverrideBaseOptions();
  options_.create_if_missing = create_if_missing_;
  if (bulk_load_) {
    options_.PrepareForBulkLoad();
  }
}

// Reads stdin line by line, Put()s parsed key/value pairs, and optionally
// compacts the DB afterwards. Lines that look like scan-command chatter
// ("Keys in range:", "Created bg thread 0x") are silently skipped; anything
// else unparseable is counted and reported as a bad line.
void DBLoaderCommand::DoCommand() {
  if (!db_) {
    assert(GetExecuteState().IsFailed());
    return;
  }

  WriteOptions write_options;
  if (disable_wal_) {
    write_options.disableWAL = true;
  }

  int bad_lines = 0;
  std::string line;
  // prefer ifstream getline performance vs that from std::cin istream
  // NOTE(review): /dev/stdin is POSIX-specific; falls back to std::cin
  // when it cannot be opened.
  std::ifstream ifs_stdin("/dev/stdin");
  std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
  Status s;
  // Stop at the first failed Put() or at EOF.
  while (s.ok() && getline(*istream_p, line, '\n')) {
    std::string key;
    std::string value;
    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
      s = db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value));
    } else if (0 == line.find("Keys in range:")) {
      // ignore this line
    } else if (0 == line.find("Created bg thread 0x")) {
      // ignore this line
    } else {
      bad_lines++;
    }
  }

  if (bad_lines > 0) {
    std::cout << "Warning: " << bad_lines << " bad lines ignored." << std::endl;
  }
  if (!s.ok()) {
    std::stringstream oss;
    oss << "Load failed: " << s.ToString();
    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
  }
  if (compact_ && s.ok()) {
    s = db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr,
                          nullptr);
  }
  if (!s.ok()) {
    std::stringstream oss;
    oss << "Compaction failed: " << s.ToString();
    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
  }
}

// ----------------------------------------------------------------------------

namespace {

// Prints the contents of a MANIFEST file via VersionSet::DumpManifest().
void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
                      bool json) {
  EnvOptions sopt;
  std::string dbname("dummy");
  std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10,
                                        options.table_cache_numshardbits));
  // Notice we are using the default options not through SanitizeOptions(),
  // if VersionSet::DumpManifest() depends on any option done by
  // SanitizeOptions(), we need to initialize it manually.
+ options.db_paths.emplace_back("dummy", 0); + options.num_levels = 64; + WriteController wc(options.delayed_write_rate); + WriteBufferManager wb(options.db_write_buffer_size); + ImmutableDBOptions immutable_db_options(options); + VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ ""); + Status s = versions.DumpManifest(options, file, verbose, hex, json); + if (!s.ok()) { + fprintf(stderr, "Error in processing file %s %s\n", file.c_str(), + s.ToString().c_str()); + } +} + +} // namespace + +const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const std::string ManifestDumpCommand::ARG_JSON = "json"; +const std::string ManifestDumpCommand::ARG_PATH = "path"; + +void ManifestDumpCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ManifestDumpCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append(" [--" + ARG_JSON + "]"); + ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]"); + ret.append("\n"); +} + +ManifestDumpCommand::ManifestDumpCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, false, + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})), + verbose_(false), + json_(false), + path_("") { + verbose_ = IsFlagPresent(flags, ARG_VERBOSE); + json_ = IsFlagPresent(flags, ARG_JSON); + + auto itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ = itr->second; + if (path_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname"); + } + } +} + +void ManifestDumpCommand::DoCommand() { + std::string manifestfile; + + if (!path_.empty()) { + manifestfile = path_; + } else { + // We need to find the manifest file by searching the directory + // containing the db for files of the form MANIFEST_[0-9]+ + + 
std::vector<std::string> files; + Status s = options_.env->GetChildren(db_path_, &files); + if (!s.ok()) { + std::string err_msg = s.ToString(); + err_msg.append(": Failed to list the content of "); + err_msg.append(db_path_); + exec_state_ = LDBCommandExecuteResult::Failed(err_msg); + return; + } + const std::string kManifestNamePrefix = "MANIFEST-"; + std::string matched_file; +#ifdef OS_WIN + const char kPathDelim = '\\'; +#else + const char kPathDelim = '/'; +#endif + for (const auto& file_path : files) { + // Some Env::GetChildren() return absolute paths. Some directories' path + // end with path delim, e.g. '/' or '\\'. + size_t pos = file_path.find_last_of(kPathDelim); + if (pos == file_path.size() - 1) { + continue; + } + std::string fname; + if (pos != std::string::npos) { + // Absolute path. + fname.assign(file_path, pos + 1, file_path.size() - pos - 1); + } else { + fname = file_path; + } + uint64_t file_num = 0; + FileType file_type = kWalFile; // Just for initialization + if (ParseFileName(fname, &file_num, &file_type) && + file_type == kDescriptorFile) { + if (!matched_file.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Multiple MANIFEST files found; use --path to select one"); + return; + } else { + matched_file.swap(fname); + } + } + } + if (matched_file.empty()) { + std::string err_msg("No MANIFEST found in "); + err_msg.append(db_path_); + exec_state_ = LDBCommandExecuteResult::Failed(err_msg); + return; + } + if (db_path_.back() != '/') { + db_path_.append("/"); + } + manifestfile = db_path_ + matched_file; + } + + if (verbose_) { + fprintf(stdout, "Processing Manifest file %s\n", manifestfile.c_str()); + } + + DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_); + + if (verbose_) { + fprintf(stdout, "Processing Manifest file %s done\n", manifestfile.c_str()); + } +} + +// ---------------------------------------------------------------------------- +namespace { + +Status 
GetLiveFilesChecksumInfoFromVersionSet(Options options, + const std::string& db_path, + FileChecksumList* checksum_list) { + EnvOptions sopt; + Status s; + std::string dbname(db_path); + std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10, + options.table_cache_numshardbits)); + // Notice we are using the default options not through SanitizeOptions(), + // if VersionSet::GetLiveFilesChecksumInfo depends on any option done by + // SanitizeOptions(), we need to initialize it manually. + options.db_paths.emplace_back(db_path, 0); + options.num_levels = 64; + WriteController wc(options.delayed_write_rate); + WriteBufferManager wb(options.db_write_buffer_size); + ImmutableDBOptions immutable_db_options(options); + VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ ""); + std::vector<std::string> cf_name_list; + s = versions.ListColumnFamilies(&cf_name_list, db_path, + immutable_db_options.fs.get()); + if (s.ok()) { + std::vector<ColumnFamilyDescriptor> cf_list; + for (const auto& name : cf_name_list) { + cf_list.emplace_back(name, ColumnFamilyOptions(options)); + } + s = versions.Recover(cf_list, true); + } + if (s.ok()) { + s = versions.GetLiveFilesChecksumInfo(checksum_list); + } + return s; +} + +} // namespace + +const std::string FileChecksumDumpCommand::ARG_PATH = "path"; + +void FileChecksumDumpCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(FileChecksumDumpCommand::Name()); + ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]"); + ret.append("\n"); +} + +FileChecksumDumpCommand::FileChecksumDumpCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_PATH, ARG_HEX})), + path_("") { + auto itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ 
= itr->second; + if (path_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname"); + } + } + is_checksum_hex_ = IsFlagPresent(flags, ARG_HEX); +} + +void FileChecksumDumpCommand::DoCommand() { + // print out the checksum information in the following format: + // sst file number, checksum function name, checksum value + // sst file number, checksum function name, checksum value + // ...... + + std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList()); + Status s = GetLiveFilesChecksumInfoFromVersionSet(options_, db_path_, + checksum_list.get()); + if (s.ok() && checksum_list != nullptr) { + std::vector<uint64_t> file_numbers; + std::vector<std::string> checksums; + std::vector<std::string> checksum_func_names; + s = checksum_list->GetAllFileChecksums(&file_numbers, &checksums, + &checksum_func_names); + if (s.ok()) { + for (size_t i = 0; i < file_numbers.size(); i++) { + assert(i < file_numbers.size()); + assert(i < checksums.size()); + assert(i < checksum_func_names.size()); + std::string checksum; + if (is_checksum_hex_) { + checksum = StringToHex(checksums[i]); + } else { + checksum = std::move(checksums[i]); + } + fprintf(stdout, "%" PRId64 ", %s, %s\n", file_numbers[i], + checksum_func_names[i].c_str(), checksum.c_str()); + } + fprintf(stdout, "Print SST file checksum information finished \n"); + } + } + + if (!s.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(s.ToString()); + } +} + +// ---------------------------------------------------------------------------- + +void GetPropertyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(GetPropertyCommand::Name()); + ret.append(" <property_name>"); + ret.append("\n"); +} + +GetPropertyCommand::GetPropertyCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, BuildCmdLineOptions({})) { + if (params.size() != 1) { + 
exec_state_ = + LDBCommandExecuteResult::Failed("property name must be specified"); + } else { + property_ = params[0]; + } +} + +void GetPropertyCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + + std::map<std::string, std::string> value_map; + std::string value; + + // Rather than having different ldb command for map properties vs. string + // properties, we simply try Map property first. (This order only chosen + // because I prefer the map-style output for + // "rocksdb.aggregated-table-properties".) + if (db_->GetMapProperty(GetCfHandle(), property_, &value_map)) { + if (value_map.empty()) { + fprintf(stdout, "%s: <empty map>\n", property_.c_str()); + } else { + for (auto& e : value_map) { + fprintf(stdout, "%s.%s: %s\n", property_.c_str(), e.first.c_str(), + e.second.c_str()); + } + } + } else if (db_->GetProperty(GetCfHandle(), property_, &value)) { + fprintf(stdout, "%s: %s\n", property_.c_str(), value.c_str()); + } else { + exec_state_ = + LDBCommandExecuteResult::Failed("failed to get property: " + property_); + } +} + +// ---------------------------------------------------------------------------- + +void ListColumnFamiliesCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ListColumnFamiliesCommand::Name()); + ret.append("\n"); +} + +ListColumnFamiliesCommand::ListColumnFamiliesCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} + +void ListColumnFamiliesCommand::DoCommand() { + std::vector<std::string> column_families; + Status s = DB::ListColumnFamilies(options_, db_path_, &column_families); + if (!s.ok()) { + fprintf(stderr, "Error in processing db %s %s\n", db_path_.c_str(), + s.ToString().c_str()); + } else { + fprintf(stdout, "Column families in %s: \n{", db_path_.c_str()); + bool first = true; + for (auto cf : column_families) { + 
if (!first) { + fprintf(stdout, ", "); + } + first = false; + fprintf(stdout, "%s", cf.c_str()); + } + fprintf(stdout, "}\n"); + } +} + +void CreateColumnFamilyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(CreateColumnFamilyCommand::Name()); + ret.append(" --db=<db_path> <new_column_family_name>"); + ret.append("\n"); +} + +CreateColumnFamilyCommand::CreateColumnFamilyCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, {ARG_DB}) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "new column family name must be specified"); + } else { + new_cf_name_ = params[0]; + } +} + +void CreateColumnFamilyCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + ColumnFamilyHandle* new_cf_handle = nullptr; + Status st = db_->CreateColumnFamily(options_, new_cf_name_, &new_cf_handle); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + "Fail to create new column family: " + st.ToString()); + } + delete new_cf_handle; + CloseDB(); +} + +void DropColumnFamilyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DropColumnFamilyCommand::Name()); + ret.append(" --db=<db_path> <column_family_name_to_drop>"); + ret.append("\n"); +} + +DropColumnFamilyCommand::DropColumnFamilyCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, {ARG_DB}) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "The name of column family to drop must be specified"); + } else { + cf_name_to_drop_ = params[0]; + } +} + +void DropColumnFamilyCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + auto iter = cf_handles_.find(cf_name_to_drop_); 
+ if (iter == cf_handles_.end()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Column family: " + cf_name_to_drop_ + " doesn't exist in db."); + return; + } + ColumnFamilyHandle* cf_handle_to_drop = iter->second; + Status st = db_->DropColumnFamily(cf_handle_to_drop); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + "Fail to drop column family: " + st.ToString()); + } + CloseDB(); +} + +// ---------------------------------------------------------------------------- +namespace { + +// This function only called when it's the sane case of >1 buckets in time-range +// Also called only when timekv falls between ttl_start and ttl_end provided +void IncBucketCounts(std::vector<uint64_t>& bucket_counts, int ttl_start, + int time_range, int bucket_size, int timekv, + int num_buckets) { +#ifdef NDEBUG + (void)time_range; + (void)num_buckets; +#endif + assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && + timekv < (ttl_start + time_range) && num_buckets > 1); + int bucket = (timekv - ttl_start) / bucket_size; + bucket_counts[bucket]++; +} + +void PrintBucketCounts(const std::vector<uint64_t>& bucket_counts, + int ttl_start, int ttl_end, int bucket_size, + int num_buckets) { + int time_point = ttl_start; + for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %lu\n", + TimeToHumanString(time_point).c_str(), + TimeToHumanString(time_point + bucket_size).c_str(), + (unsigned long)bucket_counts[i]); + } + fprintf(stdout, "Keys in range %s to %s : %lu\n", + TimeToHumanString(time_point).c_str(), + TimeToHumanString(ttl_end).c_str(), + (unsigned long)bucket_counts[num_buckets - 1]); +} + +} // namespace + +const std::string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const std::string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const std::string InternalDumpCommand::ARG_STATS = "stats"; +const std::string 
InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; + +InternalDumpCommand::InternalDumpCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX, ARG_DECODE_BLOB_INDEX})), + has_from_(false), + has_to_(false), + max_keys_(-1), + delim_("."), + count_only_(false), + count_delim_(false), + print_stats_(false), + is_input_key_hex_(false), + decode_blob_index_(false) { + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + auto itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + // fprintf(stdout,"delim = %c\n",delim_[0]); + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_ = "."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX); + decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX); + + if (is_input_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=<N>]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]"); + ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + 
    // (continuation of InternalDumpCommand::DoCommand)
    assert(GetExecuteState().IsFailed());
    return;
  }

  if (print_stats_) {
    std::string stats;
    if (db_->GetProperty(GetCfHandle(), "rocksdb.stats", &stats)) {
      fprintf(stdout, "%s\n", stats.c_str());
    }
  }

  // Cast as DBImpl to get internal iterator
  std::vector<KeyVersion> key_versions;
  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
                                &key_versions);
  if (!st.ok()) {
    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
    return;
  }
  // rtype1/rtype2 track the current and previous key-prefix group for
  // --count_delim aggregation; c/s2 accumulate count and size per group.
  std::string rtype1, rtype2, row, val;
  rtype2 = "";
  uint64_t c = 0;
  uint64_t s1 = 0, s2 = 0;

  long long count = 0;
  for (auto& key_version : key_versions) {
    ValueType value_type = static_cast<ValueType>(key_version.type);
    InternalKey ikey(key_version.user_key, key_version.sequence, value_type);
    if (has_to_ && ikey.user_key() == to_) {
      // GetAllKeyVersions() includes keys with user key `to_`, but idump has
      // traditionally excluded such keys.
      break;
    }
    ++count;
    int k;
    if (count_delim_) {
      rtype1 = "";
      s1 = 0;
      row = ikey.Encode().ToString();
      val = key_version.value;
      // Size counted up to the first '\x01' or NUL in key and value.
      for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) s1++;
      for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) s1++;
      // Extract the prefix before the delimiter as the group key.
      for (int j = 0; row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01';
           j++)
        rtype1 += row[j];
      if (rtype2.compare("") && rtype2.compare(rtype1) != 0) {
        // Group changed: flush the previous group's totals.
        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
                rtype2.c_str(), c, s2);
        c = 1;
        s2 = s1;
        rtype2 = rtype1;
      } else {
        c++;
        s2 += s1;
        rtype2 = rtype1;
      }
    }

    if (!count_only_ && !count_delim_) {
      std::string key = ikey.DebugString(is_key_hex_);
      Slice value(key_version.value);
      if (!decode_blob_index_ || value_type != kTypeBlobIndex) {
        fprintf(stdout, "%s => %s\n", key.c_str(),
                value.ToString(is_value_hex_).c_str());
      } else {
        BlobIndex blob_index;

        const Status s = blob_index.DecodeFrom(value);
        if (!s.ok()) {
          fprintf(stderr, "%s => error decoding blob index =>\n", key.c_str());
        } else {
          fprintf(stdout, "%s => %s\n", key.c_str(),
                  blob_index.DebugString(is_value_hex_).c_str());
        }
      }
    }

    // Terminate if maximum number of keys have been dumped
    if (max_keys_ > 0 && count >= max_keys_) break;
  }
  if (count_delim_) {
    // Flush the final group.
    fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
            rtype2.c_str(), c, s2);
  } else {
    fprintf(stdout, "Internal keys in range: %lld\n", count);
  }
}

const std::string DBDumperCommand::ARG_COUNT_ONLY = "count_only";
const std::string DBDumperCommand::ARG_COUNT_DELIM = "count_delim";
const std::string DBDumperCommand::ARG_STATS = "stats";
const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket";

// Dumps user key/value pairs from the DB (or from a single file via --path).
DBDumperCommand::DBDumperCommand(
    const std::vector<std::string>& /*params*/,
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(
          options, flags, true,
          BuildCmdLineOptions(
              {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
               ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
               ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET, ARG_TIMESTAMP,
               ARG_PATH, ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
      null_from_(true),
      null_to_(true),
      max_keys_(-1),
      count_only_(false),
      count_delim_(false),
      print_stats_(false),
      decode_blob_index_(false) {
  auto itr = options.find(ARG_FROM);
  if (itr != options.end()) {
    null_from_ = false;
    from_ = itr->second;
  }

  itr = options.find(ARG_TO);
  if (itr != options.end()) {
    null_to_ = false;
    to_ = itr->second;
  }

  itr = options.find(ARG_MAX_KEYS);
  if (itr != options.end()) {
    try {
#if defined(CYGWIN)
      // std::stoi() is not available on Cygwin.
      max_keys_ = strtol(itr->second.c_str(), 0, 10);
#else
      max_keys_ = std::stoi(itr->second);
#endif
    } catch (const std::invalid_argument&) {
      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
                                                    " has an invalid value");
    } catch (const std::out_of_range&) {
      exec_state_ = LDBCommandExecuteResult::Failed(
ARG_MAX_KEYS + " has a value out-of-range"); + } + } + itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_ = "."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX); + dump_uncompressed_blobs_ = IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS); + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } + + itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ = itr->second; + if (db_path_.empty()) { + db_path_ = path_; + } + } +} + +void DBDumperCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DBDumperCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=<N>]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_TTL_BUCKET + "=<N>]"); + ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]"); + ret.append(" [--" + ARG_PATH + "=<path_to_a_file>]"); + ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]"); + ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "]"); + ret.append("\n"); +} + +/** + * Handles two separate cases: + * + * 1) --db is specified - just dump the database. + * + * 2) --path is specified - determine based on file extension what dumping + * function to call. Please note that we intentionally use the extension + * and avoid probing the file contents under the assumption that renaming + * the files is not a supported scenario. 
 *
 */
void DBDumperCommand::DoCommand() {
  if (!db_) {
    // --path mode: dispatch on the parsed file type.
    assert(!path_.empty());
    std::string fileName = GetFileNameFromPath(path_);
    uint64_t number;
    FileType type;

    exec_state_ = LDBCommandExecuteResult::Succeed("");

    if (!ParseFileName(fileName, &number, &type)) {
      exec_state_ =
          LDBCommandExecuteResult::Failed("Can't parse file type: " + path_);
      return;
    }

    switch (type) {
      case kWalFile:
        // TODO(myabandeh): allow configuring is_write_commited
        DumpWalFile(options_, path_, /* print_header_ */ true,
                    /* print_values_ */ true, true /* is_write_commited */,
                    &exec_state_);
        break;
      case kTableFile:
        DumpSstFile(options_, path_, is_key_hex_, /* show_properties */ true,
                    decode_blob_index_, from_, to_);
        break;
      case kDescriptorFile:
        DumpManifestFile(options_, path_, /* verbose_ */ false, is_key_hex_,
                         /* json_ */ false);
        break;
      case kBlobFile:
        DumpBlobFile(path_, is_key_hex_, is_value_hex_,
                     dump_uncompressed_blobs_);
        break;
      default:
        exec_state_ = LDBCommandExecuteResult::Failed(
            "File type not supported: " + path_);
        break;
    }

  } else {
    // --db mode: iterate the open database.
    DoDumpCommand();
  }
}

// Dumps the open DB over an optional [from_, to_) range, with optional TTL
// time-bucketing, per-prefix counting (--count_delim), or count-only mode.
// NOTE: this definition continues past the end of this chunk.
void DBDumperCommand::DoDumpCommand() {
  assert(nullptr != db_);
  assert(path_.empty());

  // Parse command line args
  uint64_t count = 0;
  if (print_stats_) {
    std::string stats;
    if (db_->GetProperty("rocksdb.stats", &stats)) {
      fprintf(stdout, "%s\n", stats.c_str());
    }
  }

  // Setup key iterator
  ReadOptions scan_read_opts;
  scan_read_opts.total_order_seek = true;
  Iterator* iter = db_->NewIterator(scan_read_opts, GetCfHandle());
  Status st = iter->status();
  if (!st.ok()) {
    exec_state_ =
        LDBCommandExecuteResult::Failed("Iterator error." + st.ToString());
  }

  if (!null_from_) {
    iter->Seek(from_);
  } else {
    iter->SeekToFirst();
  }

  int max_keys = max_keys_;
  int ttl_start;
  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
  }
  int ttl_end;
  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
  }
  if (ttl_end < ttl_start) {
    fprintf(stderr, "Error: End time can't be less than start time\n");
    delete iter;
    return;
  }
  int time_range = ttl_end - ttl_start;
  int bucket_size;
  if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) ||
      bucket_size <= 0) {
    bucket_size = time_range;  // Will have just 1 bucket by default
  }
  // creating variables for row count of each type
  std::string rtype1, rtype2, row, val;
  rtype2 = "";
  uint64_t c = 0;
  uint64_t s1 = 0, s2 = 0;

  // At this point, bucket_size=0 => time_range=0
  int num_buckets = (bucket_size >= time_range)
                        ? 1
                        : ((time_range + bucket_size - 1) / bucket_size);
  std::vector<uint64_t> bucket_counts(num_buckets, 0);
  if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
    fprintf(stdout, "Dumping key-values from %s to %s\n",
            TimeToHumanString(ttl_start).c_str(),
            TimeToHumanString(ttl_end).c_str());
  }

  HistogramImpl vsize_hist;

  for (; iter->Valid(); iter->Next()) {
    int rawtime = 0;
    // If end marker was specified, we stop before it
    if (!null_to_ && (iter->key().ToString() >= to_)) break;
    // Terminate if maximum number of keys have been dumped
    if (max_keys == 0) break;
    if (is_db_ttl_) {
      TtlIterator* it_ttl = static_cast_with_check<TtlIterator>(iter);
      rawtime = it_ttl->ttl_timestamp();
      if (rawtime < ttl_start || rawtime >= ttl_end) {
        continue;
      }
    }
    if (max_keys > 0) {
      --max_keys;
    }
    if (is_db_ttl_ && num_buckets > 1) {
      IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size,
                      rawtime, num_buckets);
    }
    ++count;
    if (count_delim_) {
      rtype1 = "";
      row = iter->key().ToString();
      val = iter->value().ToString();
      s1 = row.size() + val.size();
      // Group by the key prefix before the delimiter character.
      for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++)
        rtype1 += row[j];
      if (rtype2.compare("") && rtype2.compare(rtype1) != 0) {
        // Group changed: flush the previous group's totals.
        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
                rtype2.c_str(), c, s2);
        c = 1;
        s2 = s1;
        rtype2 = rtype1;
      } else {
        c++;
        s2 += s1;
        rtype2 = rtype1;
      }
    }

    if (count_only_) {
      vsize_hist.Add(iter->value().size());
    }

    if (!count_only_ && !count_delim_) {
      if (is_db_ttl_ && timestamp_) {
        fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str());
      }
      std::string str =
          PrintKeyValue(iter->key().ToString(), iter->value().ToString(),
                        is_key_hex_, is_value_hex_);
      fprintf(stdout, "%s\n", str.c_str());
    }
  }

  if (num_buckets > 1 && is_db_ttl_) {
    PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size,
                      num_buckets);
  } else if (count_delim_) {
    fprintf(stdout, "%s
=> count:%" PRIu64 "\tsize:%" PRIu64 "\n", + rtype2.c_str(), c, s2); + } else { + fprintf(stdout, "Keys in range: %" PRIu64 "\n", count); + } + + if (count_only_) { + fprintf(stdout, "Value size distribution: \n"); + fprintf(stdout, "%s\n", vsize_hist.ToString().c_str()); + } + // Clean up + delete iter; +} + +const std::string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const std::string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = + "print_old_levels"; + +ReduceDBLevelsCommand::ReduceDBLevelsCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), + old_levels_(1 << 7), + new_levels_(-1), + print_old_levels_(false) { + ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); + print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); + + if (new_levels_ <= 0) { + exec_state_ = LDBCommandExecuteResult::Failed( + " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); + } +} + +std::vector<std::string> ReduceDBLevelsCommand::PrepareArgs( + const std::string& db_path, int new_levels, bool print_old_level) { + std::vector<std::string> ret; + ret.push_back("reduce_levels"); + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels)); + if (print_old_level) { + ret.push_back("--" + ARG_PRINT_OLD_LEVELS); + } + return ret; +} + +void ReduceDBLevelsCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ReduceDBLevelsCommand::Name()); + ret.append(" --" + ARG_NEW_LEVELS + "=<New number of levels>"); + ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret.append("\n"); +} + +void ReduceDBLevelsCommand::OverrideBaseCFOptions( + ColumnFamilyOptions* cf_opts) { + LDBCommand::OverrideBaseCFOptions(cf_opts); + cf_opts->num_levels = old_levels_; + 
cf_opts->max_bytes_for_level_multiplier_additional.resize(cf_opts->num_levels, + 1); + // Disable size compaction + cf_opts->max_bytes_for_level_base = 1ULL << 50; + cf_opts->max_bytes_for_level_multiplier = 1; +} + +Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) { + ImmutableDBOptions db_options(opt); + EnvOptions soptions; + std::shared_ptr<Cache> tc( + NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits)); + const InternalKeyComparator cmp(opt.comparator); + WriteController wc(opt.delayed_write_rate); + WriteBufferManager wb(opt.db_write_buffer_size); + VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id*/ "", /*db_session_id*/ ""); + std::vector<ColumnFamilyDescriptor> dummy; + ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(opt)); + dummy.push_back(dummy_descriptor); + // We rely the VersionSet::Recover to tell us the internal data structures + // in the db. And the Recover() should never do any change + // (like LogAndApply) to the manifest file. 
+ Status st = versions.Recover(dummy); + if (!st.ok()) { + return st; + } + int max = -1; + auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); + for (int i = 0; i < default_cfd->NumberLevels(); i++) { + if (default_cfd->current()->storage_info()->NumLevelFiles(i)) { + max = i; + } + } + + *levels = max + 1; + return st; +} + +void ReduceDBLevelsCommand::DoCommand() { + if (new_levels_ <= 1) { + exec_state_ = + LDBCommandExecuteResult::Failed("Invalid number of levels.\n"); + return; + } + + Status st; + PrepareOptions(); + int old_level_num = -1; + st = GetOldNumOfLevels(options_, &old_level_num); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + return; + } + + if (print_old_levels_) { + fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); + } + + if (old_level_num <= new_levels_) { + return; + } + + old_levels_ = old_level_num; + + OpenDB(); + if (exec_state_.IsFailed()) { + return; + } + assert(db_ != nullptr); + // Compact the whole DB to put all files to the highest level. 
+ fprintf(stdout, "Compacting the db...\n"); + st = + db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr, nullptr); + + CloseDB(); + + if (st.ok()) { + EnvOptions soptions; + st = VersionSet::ReduceNumberOfLevels(db_path_, &options_, soptions, + new_levels_); + } + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + return; + } +} + +const std::string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = + "old_compaction_style"; +const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = + "new_compaction_style"; + +ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions( + {ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})), + old_compaction_style_(-1), + new_compaction_style_(-1) { + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, + exec_state_); + if (old_compaction_style_ != kCompactionStyleLevel && + old_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, + exec_state_); + if (new_compaction_style_ != kCompactionStyleLevel && + new_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + if (new_compaction_style_ == old_compaction_style_) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Old compaction style is the same as new compaction style. 
" + "Nothing to do.\n"); + return; + } + + if (old_compaction_style_ == kCompactionStyleUniversal && + new_compaction_style_ == kCompactionStyleLevel) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Convert from universal compaction to level compaction. " + "Nothing to do.\n"); + return; + } +} + +void ChangeCompactionStyleCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ChangeCompactionStyleCommand::Name()); + ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "=<Old compaction style: 0 " + + "for level compaction, 1 for universal compaction>"); + ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "=<New compaction style: 0 " + + "for level compaction, 1 for universal compaction>"); + ret.append("\n"); +} + +void ChangeCompactionStyleCommand::OverrideBaseCFOptions( + ColumnFamilyOptions* cf_opts) { + LDBCommand::OverrideBaseCFOptions(cf_opts); + if (old_compaction_style_ == kCompactionStyleLevel && + new_compaction_style_ == kCompactionStyleUniversal) { + // In order to convert from level compaction to universal compaction, we + // need to compact all data into a single file and move it to level 0. + cf_opts->disable_auto_compactions = true; + cf_opts->target_file_size_base = INT_MAX; + cf_opts->target_file_size_multiplier = 1; + cf_opts->max_bytes_for_level_base = INT_MAX; + cf_opts->max_bytes_for_level_multiplier = 1; + } +} + +void ChangeCompactionStyleCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + // print db stats before we have made any change + std::string property; + std::string files_per_level; + for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) { + db_->GetProperty(GetCfHandle(), + "rocksdb.num-files-at-level" + std::to_string(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? 
"," : ""), property.c_str()); + files_per_level += buf; + } + fprintf(stdout, "files per level before compaction: %s\n", + files_per_level.c_str()); + + // manual compact into a single file and move the file to level 0 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 0; + Status s = + db_->CompactRange(compact_options, GetCfHandle(), nullptr, nullptr); + if (!s.ok()) { + std::stringstream oss; + oss << "Compaction failed: " << s.ToString(); + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + return; + } + + // verify compaction result + files_per_level = ""; + int num_files = 0; + for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) { + db_->GetProperty(GetCfHandle(), + "rocksdb.num-files-at-level" + std::to_string(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + files_per_level += buf; + + num_files = atoi(property.c_str()); + + // level 0 should have only 1 file + if (i == 0 && num_files != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Number of db files at " + "level 0 after compaction is " + + std::to_string(num_files) + ", not 1.\n"); + return; + } + // other levels should have no file + if (i > 0 && num_files != 0) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Number of db files at " + "level " + + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); + return; + } + } + + fprintf(stdout, "files per level after compaction: %s\n", + files_per_level.c_str()); +} + +// ---------------------------------------------------------------------------- + +namespace { + +struct StdErrReporter : public log::Reader::Reporter { + void Corruption(size_t /*bytes*/, const Status& s) override { + std::cerr << "Corruption detected in log file " << s.ToString() << "\n"; + } +}; + +class InMemoryHandler : public WriteBatch::Handler { + public: + 
InMemoryHandler(std::stringstream& row, bool print_values, + bool write_after_commit = false) + : Handler(), + row_(row), + print_values_(print_values), + write_after_commit_(write_after_commit) {} + + void commonPutMerge(const Slice& key, const Slice& value) { + std::string k = LDBCommand::StringToHex(key.ToString()); + if (print_values_) { + std::string v = LDBCommand::StringToHex(value.ToString()); + row_ << k << " : "; + row_ << v << " "; + } else { + row_ << k << " "; + } + } + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + row_ << "PUT(" << cf << ") : "; + commonPutMerge(key, value); + return Status::OK(); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + row_ << "MERGE(" << cf << ") : "; + commonPutMerge(key, value); + return Status::OK(); + } + + Status MarkNoop(bool) override { + row_ << "NOOP "; + return Status::OK(); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + row_ << "DELETE(" << cf << ") : "; + row_ << LDBCommand::StringToHex(key.ToString()) << " "; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + row_ << "SINGLE_DELETE(" << cf << ") : "; + row_ << LDBCommand::StringToHex(key.ToString()) << " "; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice& end_key) override { + row_ << "DELETE_RANGE(" << cf << ") : "; + row_ << LDBCommand::StringToHex(begin_key.ToString()) << " "; + row_ << LDBCommand::StringToHex(end_key.ToString()) << " "; + return Status::OK(); + } + + Status MarkBeginPrepare(bool unprepare) override { + row_ << "BEGIN_PREPARE("; + row_ << (unprepare ? 
"true" : "false") << ") "; + return Status::OK(); + } + + Status MarkEndPrepare(const Slice& xid) override { + row_ << "END_PREPARE("; + row_ << LDBCommand::StringToHex(xid.ToString()) << ") "; + return Status::OK(); + } + + Status MarkRollback(const Slice& xid) override { + row_ << "ROLLBACK("; + row_ << LDBCommand::StringToHex(xid.ToString()) << ") "; + return Status::OK(); + } + + Status MarkCommit(const Slice& xid) override { + row_ << "COMMIT("; + row_ << LDBCommand::StringToHex(xid.ToString()) << ") "; + return Status::OK(); + } + + Status MarkCommitWithTimestamp(const Slice& xid, + const Slice& commit_ts) override { + row_ << "COMMIT_WITH_TIMESTAMP("; + row_ << LDBCommand::StringToHex(xid.ToString()) << ", "; + row_ << LDBCommand::StringToHex(commit_ts.ToString()) << ") "; + return Status::OK(); + } + + ~InMemoryHandler() override {} + + protected: + Handler::OptionState WriteAfterCommit() const override { + return write_after_commit_ ? Handler::OptionState::kEnabled + : Handler::OptionState::kDisabled; + } + + private: + std::stringstream& row_; + bool print_values_; + bool write_after_commit_; +}; + +void DumpWalFile(Options options, std::string wal_file, bool print_header, + bool print_values, bool is_write_committed, + LDBCommandExecuteResult* exec_state) { + const auto& fs = options.env->GetFileSystem(); + FileOptions soptions(options); + std::unique_ptr<SequentialFileReader> wal_file_reader; + Status status = SequentialFileReader::Create( + fs, wal_file, soptions, &wal_file_reader, nullptr /* dbg */, + nullptr /* rate_limiter */); + if (!status.ok()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " + + status.ToString()); + } else { + std::cerr << "Error: Failed to open WAL file " << status.ToString() + << std::endl; + } + } else { + StdErrReporter reporter; + uint64_t log_number; + FileType type; + + // we need the log number, but ParseFilename expects dbname/NNN.log. 
+ std::string sanitized = wal_file; + size_t lastslash = sanitized.rfind('/'); + if (lastslash != std::string::npos) + sanitized = sanitized.substr(lastslash + 1); + if (!ParseFileName(sanitized, &log_number, &type)) { + // bogus input, carry on as best we can + log_number = 0; + } + log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter, + true /* checksum */, log_number); + std::string scratch; + WriteBatch batch; + Slice record; + std::stringstream row; + if (print_header) { + std::cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values) { + std::cout << " : value "; + } + std::cout << "\n"; + } + while (status.ok() && reader.ReadRecord(&record, &scratch)) { + row.str(""); + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + } else { + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + std::stringstream oss; + oss << "Parsing write batch failed: " << status.ToString(); + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed(oss.str()); + } else { + std::cerr << oss.str() << std::endl; + } + break; + } + row << WriteBatchInternal::Sequence(&batch) << ","; + row << WriteBatchInternal::Count(&batch) << ","; + row << WriteBatchInternal::ByteSize(&batch) << ","; + row << reader.LastRecordOffset() << ","; + InMemoryHandler handler(row, print_values, is_write_committed); + status = batch.Iterate(&handler); + if (!status.ok()) { + if (exec_state) { + std::stringstream oss; + oss << "Print write batch error: " << status.ToString(); + *exec_state = LDBCommandExecuteResult::Failed(oss.str()); + } + row << "error: " << status.ToString(); + break; + } + row << "\n"; + } + std::cout << row.str(); + } + } +} + +} // namespace + +const std::string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const std::string WALDumperCommand::ARG_WRITE_COMMITTED = "write_committed"; +const std::string 
WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const std::string WALDumperCommand::ARG_PRINT_HEADER = "header"; + +WALDumperCommand::WALDumperCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_WAL_FILE, ARG_WRITE_COMMITTED, + ARG_PRINT_HEADER, ARG_PRINT_VALUE})), + print_header_(false), + print_values_(false), + is_write_committed_(false) { + wal_file_.clear(); + + auto itr = options.find(ARG_WAL_FILE); + if (itr != options.end()) { + wal_file_ = itr->second; + } + + print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); + print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); + is_write_committed_ = ParseBooleanOption(options, ARG_WRITE_COMMITTED, true); + + if (wal_file_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed("Argument " + ARG_WAL_FILE + + " must be specified."); + } +} + +void WALDumperCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(WALDumperCommand::Name()); + ret.append(" --" + ARG_WAL_FILE + "=<write_ahead_log_file_path>"); + ret.append(" [--" + ARG_PRINT_HEADER + "] "); + ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append(" [--" + ARG_WRITE_COMMITTED + "=true|false] "); + ret.append("\n"); +} + +void WALDumperCommand::DoCommand() { + DumpWalFile(options_, wal_file_, print_header_, print_values_, + is_write_committed_, &exec_state_); +} + +// ---------------------------------------------------------------------------- + +GetCommand::GetCommand(const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "<key> must be specified for the get command"); + } else { + key_ = params.at(0); + } + 
+ if (is_key_hex_) { + key_ = HexToString(key_); + } +} + +void GetCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(GetCommand::Name()); + ret.append(" <key>"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void GetCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + std::string value; + Status st = db_->Get(ReadOptions(), GetCfHandle(), key_, &value); + if (st.ok()) { + fprintf(stdout, "%s\n", + (is_value_hex_ ? StringToHex(value) : value).c_str()); + } else { + std::stringstream oss; + oss << "Get failed: " << st.ToString(); + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + } +} + +// ---------------------------------------------------------------------------- + +ApproxSizeCommand::ApproxSizeCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) { + if (options.find(ARG_FROM) != options.end()) { + start_key_ = options.find(ARG_FROM)->second; + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + ARG_FROM + " must be specified for approxsize command"); + return; + } + + if (options.find(ARG_TO) != options.end()) { + end_key_ = options.find(ARG_TO)->second; + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + ARG_TO + " must be specified for approxsize command"); + return; + } + + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + end_key_ = HexToString(end_key_); + } +} + +void ApproxSizeCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ApproxSizeCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void ApproxSizeCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Range ranges[1]; + ranges[0] = Range(start_key_, end_key_); + uint64_t sizes[1]; + Status s = 
db_->GetApproximateSizes(GetCfHandle(), ranges, 1, sizes); + if (!s.ok()) { + std::stringstream oss; + oss << "ApproximateSize failed: " << s.ToString(); + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + } else { + fprintf(stdout, "%lu\n", (unsigned long)sizes[0]); + } +} + +// ---------------------------------------------------------------------------- + +BatchPutCommand::BatchPutCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) { + if (params.size() < 2) { + exec_state_ = LDBCommandExecuteResult::Failed( + "At least one <key> <value> pair must be specified batchput."); + } else if (params.size() % 2 != 0) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Equal number of <key>s and <value>s must be specified for batchput."); + } else { + for (size_t i = 0; i < params.size(); i += 2) { + std::string key = params.at(i); + std::string value = params.at(i + 1); + key_values_.push_back(std::pair<std::string, std::string>( + is_key_hex_ ? HexToString(key) : key, + is_value_hex_ ? 
HexToString(value) : value)); + } + } + create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); +} + +void BatchPutCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(BatchPutCommand::Name()); + ret.append(" <key> <value> [<key> <value>] [..]"); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void BatchPutCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + WriteBatch batch; + + Status st; + std::stringstream oss; + for (std::vector<std::pair<std::string, std::string>>::const_iterator itr = + key_values_.begin(); + itr != key_values_.end(); ++itr) { + st = batch.Put(GetCfHandle(), itr->first, itr->second); + if (!st.ok()) { + oss << "Put to write batch failed: " << itr->first << "=>" << itr->second + << " error: " << st.ToString(); + break; + } + } + if (st.ok()) { + st = db_->Write(WriteOptions(), &batch); + if (!st.ok()) { + oss << "Write failed: " << st.ToString(); + } + } + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + } +} + +void BatchPutCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + +// ---------------------------------------------------------------------------- + +ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX, + ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, + ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), + start_key_specified_(false), + end_key_specified_(false), + max_keys_scanned_(-1), + no_value_(false) { + auto itr = options.find(ARG_FROM); + if (itr != options.end()) { + start_key_ = itr->second; + if (is_key_hex_) { + start_key_ = HexToString(start_key_); 
+ } + start_key_specified_ = true; + } + itr = options.find(ARG_TO); + if (itr != options.end()) { + end_key_ = itr->second; + if (is_key_hex_) { + end_key_ = HexToString(end_key_); + } + end_key_specified_ = true; + } + + std::vector<std::string>::const_iterator vitr = + std::find(flags.begin(), flags.end(), ARG_NO_VALUE); + if (vitr != flags.end()) { + no_value_ = true; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { +#if defined(CYGWIN) + max_keys_scanned_ = strtol(itr->second.c_str(), 0, 10); +#else + max_keys_scanned_ = std::stoi(itr->second); +#endif + } catch (const std::invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS + + " has an invalid value"); + } catch (const std::out_of_range&) { + exec_state_ = LDBCommandExecuteResult::Failed( + ARG_MAX_KEYS + " has a value out-of-range"); + } + } +} + +void ScanCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ScanCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=<N>q] "); + ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]"); + ret.append(" [--" + ARG_NO_VALUE + "]"); + ret.append("\n"); +} + +void ScanCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + + int num_keys_scanned = 0; + ReadOptions scan_read_opts; + scan_read_opts.total_order_seek = true; + Iterator* it = db_->NewIterator(scan_read_opts, GetCfHandle()); + if (start_key_specified_) { + it->Seek(start_key_); + } else { + it->SeekToFirst(); + } + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time 
allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete it; + return; + } + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "Scanning key-values from %s to %s\n", + TimeToHumanString(ttl_start).c_str(), + TimeToHumanString(ttl_end).c_str()); + } + for (; + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + if (is_db_ttl_) { + TtlIterator* it_ttl = static_cast_with_check<TtlIterator>(it); + int rawtime = it_ttl->ttl_timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + if (timestamp_) { + fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); + } + } + + Slice key_slice = it->key(); + + std::string formatted_key; + if (is_key_hex_) { + formatted_key = "0x" + key_slice.ToString(true /* hex */); + key_slice = formatted_key; + } else if (ldb_options_.key_formatter) { + formatted_key = ldb_options_.key_formatter->Format(key_slice); + key_slice = formatted_key; + } + + if (no_value_) { + fprintf(stdout, "%.*s\n", static_cast<int>(key_slice.size()), + key_slice.data()); + } else { + Slice val_slice = it->value(); + std::string formatted_value; + if (is_value_hex_) { + formatted_value = "0x" + val_slice.ToString(true /* hex */); + val_slice = formatted_value; + } + fprintf(stdout, "%.*s : %.*s\n", static_cast<int>(key_slice.size()), + key_slice.data(), static_cast<int>(val_slice.size()), + val_slice.data()); + } + + num_keys_scanned++; + if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { + break; + } + } + if (!it->status().ok()) { // Check for any errors found during the scan + exec_state_ = LDBCommandExecuteResult::Failed(it->status().ToString()); + } + delete it; +} + +// ---------------------------------------------------------------------------- + +DeleteCommand::DeleteCommand(const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const 
std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "KEY must be specified for the delete command"); + } else { + key_ = params.at(0); + if (is_key_hex_) { + key_ = HexToString(key_); + } + } +} + +void DeleteCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DeleteCommand::Name() + " <key>"); + ret.append("\n"); +} + +void DeleteCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = db_->Delete(WriteOptions(), GetCfHandle(), key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + } +} + +SingleDeleteCommand::SingleDeleteCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "KEY must be specified for the single delete command"); + } else { + key_ = params.at(0); + if (is_key_hex_) { + key_ = HexToString(key_); + } + } +} + +void SingleDeleteCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(SingleDeleteCommand::Name() + " <key>"); + ret.append("\n"); +} + +void SingleDeleteCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = db_->SingleDelete(WriteOptions(), GetCfHandle(), key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + } +} + +DeleteRangeCommand::DeleteRangeCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + 
BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() != 2) { + exec_state_ = LDBCommandExecuteResult::Failed( + "begin and end keys must be specified for the delete command"); + } else { + begin_key_ = params.at(0); + end_key_ = params.at(1); + if (is_key_hex_) { + begin_key_ = HexToString(begin_key_); + end_key_ = HexToString(end_key_); + } + } +} + +void DeleteRangeCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DeleteRangeCommand::Name() + " <begin key> <end key>"); + ret.append("\n"); +} + +void DeleteRangeCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = + db_->DeleteRange(WriteOptions(), GetCfHandle(), begin_key_, end_key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + } +} + +PutCommand::PutCommand(const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) { + if (params.size() != 2) { + exec_state_ = LDBCommandExecuteResult::Failed( + "<key> and <value> must be specified for the put command"); + } else { + key_ = params.at(0); + value_ = params.at(1); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } + + if (is_value_hex_) { + value_ = HexToString(value_); + } + create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); +} + +void PutCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append(" <key> <value>"); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void PutCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = db_->Put(WriteOptions(), GetCfHandle(), key_, value_); + if (st.ok()) { + 
fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); + } +} + +void PutCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + +// ---------------------------------------------------------------------------- + +const char* DBQuerierCommand::HELP_CMD = "help"; +const char* DBQuerierCommand::GET_CMD = "get"; +const char* DBQuerierCommand::PUT_CMD = "put"; +const char* DBQuerierCommand::DELETE_CMD = "delete"; + +DBQuerierCommand::DBQuerierCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + +} + +void DBQuerierCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DBQuerierCommand::Name()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); + ret.append( + " Starts a REPL shell. Type help for list of available " + "commands."); + ret.append("\n"); +} + +void DBQuerierCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + + ReadOptions read_options; + WriteOptions write_options; + + std::string line; + std::string key; + std::string value; + Status s; + std::stringstream oss; + while (s.ok() && getline(std::cin, line, '\n')) { + // Parse line into std::vector<std::string> + std::vector<std::string> tokens; + size_t pos = 0; + while (true) { + size_t pos2 = line.find(' ', pos); + if (pos2 == std::string::npos) { + break; + } + tokens.push_back(line.substr(pos, pos2 - pos)); + pos = pos2 + 1; + } + tokens.push_back(line.substr(pos)); + + const std::string& cmd = tokens[0]; + + if (cmd == HELP_CMD) { + fprintf(stdout, + "get <key>\n" + "put <key> <value>\n" + "delete <key>\n"); + } else if (cmd == DELETE_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? 
HexToString(tokens[1]) : tokens[1]); + s = db_->Delete(write_options, GetCfHandle(), Slice(key)); + if (s.ok()) { + fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str()); + } else { + oss << "delete " << key << " failed: " << s.ToString(); + } + } else if (cmd == PUT_CMD && tokens.size() == 3) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]); + s = db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value)); + if (s.ok()) { + fprintf(stdout, "Successfully put %s %s\n", tokens[1].c_str(), + tokens[2].c_str()); + } else { + oss << "put " << key << "=>" << value << " failed: " << s.ToString(); + } + } else if (cmd == GET_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + s = db_->Get(read_options, GetCfHandle(), Slice(key), &value); + if (s.ok()) { + fprintf(stdout, "%s\n", + PrintKeyValue(key, value, is_key_hex_, is_value_hex_).c_str()); + } else { + if (s.IsNotFound()) { + fprintf(stdout, "Not found %s\n", tokens[1].c_str()); + } else { + oss << "get " << key << " error: " << s.ToString(); + } + } + } else { + fprintf(stdout, "Unknown command %s\n", line.c_str()); + } + } + if (!s.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + } +} + +// ---------------------------------------------------------------------------- + +CheckConsistencyCommand::CheckConsistencyCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {} + +void CheckConsistencyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(CheckConsistencyCommand::Name()); + ret.append("\n"); +} + +void CheckConsistencyCommand::DoCommand() { + options_.paranoid_checks = true; + options_.num_levels = 64; + OpenDB(); + if (exec_state_.IsSucceed() || exec_state_.IsNotStarted()) { + fprintf(stdout, 
"OK\n"); + } + CloseDB(); +} + +// ---------------------------------------------------------------------------- + +const std::string CheckPointCommand::ARG_CHECKPOINT_DIR = "checkpoint_dir"; + +CheckPointCommand::CheckPointCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false /* is_read_only */, + BuildCmdLineOptions({ARG_CHECKPOINT_DIR})) { + auto itr = options.find(ARG_CHECKPOINT_DIR); + if (itr != options.end()) { + checkpoint_dir_ = itr->second; + } +} + +void CheckPointCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(CheckPointCommand::Name()); + ret.append(" [--" + ARG_CHECKPOINT_DIR + "] "); + ret.append("\n"); +} + +void CheckPointCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Checkpoint* checkpoint; + Status status = Checkpoint::Create(db_, &checkpoint); + status = checkpoint->CreateCheckpoint(checkpoint_dir_); + if (status.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + } +} + +// ---------------------------------------------------------------------------- + +const std::string RepairCommand::ARG_VERBOSE = "verbose"; + +RepairCommand::RepairCommand(const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE})) { + verbose_ = IsFlagPresent(flags, ARG_VERBOSE); +} + +void RepairCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(RepairCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append("\n"); +} + +void RepairCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + auto level = verbose_ ? 
InfoLogLevel::INFO_LEVEL : InfoLogLevel::WARN_LEVEL; + options_.info_log.reset(new StderrLogger(level)); +} + +void RepairCommand::DoCommand() { + PrepareOptions(); + Status status = RepairDB(db_path_, options_); + if (status.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + } +} + +// ---------------------------------------------------------------------------- + +const std::string BackupEngineCommand::ARG_NUM_THREADS = "num_threads"; +const std::string BackupEngineCommand::ARG_BACKUP_ENV_URI = "backup_env_uri"; +const std::string BackupEngineCommand::ARG_BACKUP_FS_URI = "backup_fs_uri"; +const std::string BackupEngineCommand::ARG_BACKUP_DIR = "backup_dir"; +const std::string BackupEngineCommand::ARG_STDERR_LOG_LEVEL = + "stderr_log_level"; + +BackupEngineCommand::BackupEngineCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, false /* is_read_only */, + BuildCmdLineOptions({ARG_BACKUP_ENV_URI, ARG_BACKUP_FS_URI, + ARG_BACKUP_DIR, ARG_NUM_THREADS, + ARG_STDERR_LOG_LEVEL})), + num_threads_(1) { + auto itr = options.find(ARG_NUM_THREADS); + if (itr != options.end()) { + num_threads_ = std::stoi(itr->second); + } + itr = options.find(ARG_BACKUP_ENV_URI); + if (itr != options.end()) { + backup_env_uri_ = itr->second; + } + itr = options.find(ARG_BACKUP_FS_URI); + if (itr != options.end()) { + backup_fs_uri_ = itr->second; + } + if (!backup_env_uri_.empty() && !backup_fs_uri_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "you may not specity both --" + ARG_BACKUP_ENV_URI + " and --" + + ARG_BACKUP_FS_URI); + } + itr = options.find(ARG_BACKUP_DIR); + if (itr == options.end()) { + exec_state_ = LDBCommandExecuteResult::Failed("--" + ARG_BACKUP_DIR + + ": missing backup directory"); + } else { + backup_dir_ = itr->second; + } + + itr = 
options.find(ARG_STDERR_LOG_LEVEL); + if (itr != options.end()) { + int stderr_log_level = std::stoi(itr->second); + if (stderr_log_level < 0 || + stderr_log_level >= InfoLogLevel::NUM_INFO_LOG_LEVELS) { + exec_state_ = LDBCommandExecuteResult::Failed( + ARG_STDERR_LOG_LEVEL + " must be >= 0 and < " + + std::to_string(InfoLogLevel::NUM_INFO_LOG_LEVELS) + "."); + } else { + logger_.reset( + new StderrLogger(static_cast<InfoLogLevel>(stderr_log_level))); + } + } +} + +void BackupEngineCommand::Help(const std::string& name, std::string& ret) { + ret.append(" "); + ret.append(name); + ret.append(" [--" + ARG_BACKUP_ENV_URI + " | --" + ARG_BACKUP_FS_URI + "] "); + ret.append(" [--" + ARG_BACKUP_DIR + "] "); + ret.append(" [--" + ARG_NUM_THREADS + "] "); + ret.append(" [--" + ARG_STDERR_LOG_LEVEL + "=<int (InfoLogLevel)>] "); + ret.append("\n"); +} + +// ---------------------------------------------------------------------------- + +BackupCommand::BackupCommand(const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : BackupEngineCommand(params, options, flags) {} + +void BackupCommand::Help(std::string& ret) { + BackupEngineCommand::Help(Name(), ret); +} + +void BackupCommand::DoCommand() { + BackupEngine* backup_engine; + Status status; + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + fprintf(stdout, "open db OK\n"); + + Env* custom_env = backup_env_guard_.get(); + if (custom_env == nullptr) { + Status s = + Env::CreateFromUri(config_options_, backup_env_uri_, backup_fs_uri_, + &custom_env, &backup_env_guard_); + if (!s.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(s.ToString()); + return; + } + } + assert(custom_env != nullptr); + + BackupEngineOptions backup_options = + BackupEngineOptions(backup_dir_, custom_env); + backup_options.info_log = logger_.get(); + backup_options.max_background_operations = num_threads_; + status = BackupEngine::Open(options_.env, 
backup_options, &backup_engine); + if (status.ok()) { + fprintf(stdout, "open backup engine OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + return; + } + status = backup_engine->CreateNewBackup(db_); + if (status.ok()) { + fprintf(stdout, "create new backup OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + return; + } +} + +// ---------------------------------------------------------------------------- + +RestoreCommand::RestoreCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : BackupEngineCommand(params, options, flags) {} + +void RestoreCommand::Help(std::string& ret) { + BackupEngineCommand::Help(Name(), ret); +} + +void RestoreCommand::DoCommand() { + Env* custom_env = backup_env_guard_.get(); + if (custom_env == nullptr) { + Status s = + Env::CreateFromUri(config_options_, backup_env_uri_, backup_fs_uri_, + &custom_env, &backup_env_guard_); + if (!s.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed(s.ToString()); + return; + } + } + assert(custom_env != nullptr); + + std::unique_ptr<BackupEngineReadOnly> restore_engine; + Status status; + { + BackupEngineOptions opts(backup_dir_, custom_env); + opts.info_log = logger_.get(); + opts.max_background_operations = num_threads_; + BackupEngineReadOnly* raw_restore_engine_ptr; + status = + BackupEngineReadOnly::Open(options_.env, opts, &raw_restore_engine_ptr); + if (status.ok()) { + restore_engine.reset(raw_restore_engine_ptr); + } + } + if (status.ok()) { + fprintf(stdout, "open restore engine OK\n"); + status = restore_engine->RestoreDBFromLatestBackup(db_path_, db_path_); + } + if (status.ok()) { + fprintf(stdout, "restore from backup OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + } +} + +// ---------------------------------------------------------------------------- + +namespace { + +void 
DumpSstFile(Options options, std::string filename, bool output_hex, + bool show_properties, bool decode_blob_index, + std::string from_key, std::string to_key) { + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + std::cout << "Invalid sst file name." << std::endl; + return; + } + // no verification + ROCKSDB_NAMESPACE::SstFileDumper dumper( + options, filename, Temperature::kUnknown, + 2 * 1024 * 1024 /* readahead_size */, + /* verify_checksum */ false, output_hex, decode_blob_index); + Status st = dumper.ReadSequential(true, std::numeric_limits<uint64_t>::max(), + !from_key.empty(), from_key, + !to_key.empty(), to_key); + if (!st.ok()) { + std::cerr << "Error in reading SST file " << filename << st.ToString() + << std::endl; + return; + } + + if (show_properties) { + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + + std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties> + table_properties_from_reader; + st = dumper.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + std::cerr << filename << ": " << st.ToString() + << ". Try to use initial table properties" << std::endl; + table_properties = dumper.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if (table_properties != nullptr) { + std::cout << std::endl << "Table Properties:" << std::endl; + std::cout << table_properties->ToString("\n") << std::endl; + } + } +} + +void DumpBlobFile(const std::string& filename, bool is_key_hex, + bool is_value_hex, bool dump_uncompressed_blobs) { + using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool; + BlobDumpTool tool; + BlobDumpTool::DisplayType blob_type = is_value_hex + ? BlobDumpTool::DisplayType::kHex + : BlobDumpTool::DisplayType::kRaw; + BlobDumpTool::DisplayType show_uncompressed_blob = + dump_uncompressed_blobs ? blob_type : BlobDumpTool::DisplayType::kNone; + BlobDumpTool::DisplayType show_blob = + dump_uncompressed_blobs ? 
BlobDumpTool::DisplayType::kNone : blob_type; + + BlobDumpTool::DisplayType show_key = is_key_hex + ? BlobDumpTool::DisplayType::kHex + : BlobDumpTool::DisplayType::kRaw; + Status s = tool.Run(filename, show_key, show_blob, show_uncompressed_blob, + /* show_summary */ true); + if (!s.ok()) { + fprintf(stderr, "Failed: %s\n", s.ToString().c_str()); + } +} +} // namespace + +DBFileDumperCommand::DBFileDumperCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})), + decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)), + dump_uncompressed_blobs_( + IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS)) {} + +void DBFileDumperCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DBFileDumperCommand::Name()); + ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "] "); + ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "] "); + ret.append("\n"); +} + +void DBFileDumperCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status s; + + // TODO: Use --hex, --key_hex, --value_hex flags consistently for + // dumping manifest file, sst files and blob files. 
+ std::cout << "Manifest File" << std::endl; + std::cout << "==============================" << std::endl; + std::string manifest_filename; + s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()), + &manifest_filename); + if (!s.ok() || manifest_filename.empty() || + manifest_filename.back() != '\n') { + std::cerr << "Error when reading CURRENT file " + << CurrentFileName(db_->GetName()) << std::endl; + } + // remove the trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); + std::string manifest_filepath = db_->GetName() + "/" + manifest_filename; + // Correct concatenation of filepath and filename: + // Check that there is no double slashes (or more!) when concatenation + // happens. + manifest_filepath = NormalizePath(manifest_filepath); + + std::cout << manifest_filepath << std::endl; + DumpManifestFile(options_, manifest_filepath, false, false, false); + std::cout << std::endl; + + std::vector<ColumnFamilyMetaData> column_families; + db_->GetAllColumnFamilyMetaData(&column_families); + for (const auto& column_family : column_families) { + std::cout << "Column family name: " << column_family.name << std::endl; + std::cout << "==============================" << std::endl; + std::cout << std::endl; + std::cout << "SST Files" << std::endl; + std::cout << "==============================" << std::endl; + for (const LevelMetaData& level : column_family.levels) { + for (const SstFileMetaData& sst_file : level.files) { + std::string filename = sst_file.db_path + "/" + sst_file.name; + // Correct concatenation of filepath and filename: + // Check that there is no double slashes (or more!) when concatenation + // happens. 
+ filename = NormalizePath(filename); + std::cout << filename << " level:" << level.level << std::endl; + std::cout << "------------------------------" << std::endl; + DumpSstFile(options_, filename, false, true, decode_blob_index_); + std::cout << std::endl; + } + } + std::cout << "Blob Files" << std::endl; + std::cout << "==============================" << std::endl; + for (const BlobMetaData& blob_file : column_family.blob_files) { + std::string filename = + blob_file.blob_file_path + "/" + blob_file.blob_file_name; + // Correct concatenation of filepath and filename: + // Check that there is no double slashes (or more!) when concatenation + // happens. + filename = NormalizePath(filename); + std::cout << filename << std::endl; + std::cout << "------------------------------" << std::endl; + DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false, + dump_uncompressed_blobs_); + std::cout << std::endl; + } + } + std::cout << std::endl; + + std::cout << "Write Ahead Log Files" << std::endl; + std::cout << "==============================" << std::endl; + ROCKSDB_NAMESPACE::VectorLogPtr wal_files; + s = db_->GetSortedWalFiles(wal_files); + if (!s.ok()) { + std::cerr << "Error when getting WAL files" << std::endl; + } else { + std::string wal_dir; + if (options_.wal_dir.empty()) { + wal_dir = db_->GetName(); + } else { + wal_dir = NormalizePath(options_.wal_dir + "/"); + } + for (auto& wal : wal_files) { + // TODO(qyang): option.wal_dir should be passed into ldb command + std::string filename = wal_dir + wal->PathName(); + std::cout << filename << std::endl; + // TODO(myabandeh): allow configuring is_write_commited + DumpWalFile(options_, filename, true, true, true /* is_write_commited */, + &exec_state_); + } + } +} + +const std::string DBLiveFilesMetadataDumperCommand::ARG_SORT_BY_FILENAME = + "sort_by_filename"; + +DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand( + const std::vector<std::string>& /*params*/, + const 
std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_SORT_BY_FILENAME})) { + sort_by_filename_ = IsFlagPresent(flags, ARG_SORT_BY_FILENAME); +} + +void DBLiveFilesMetadataDumperCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DBLiveFilesMetadataDumperCommand::Name()); + ret.append(" [--" + ARG_SORT_BY_FILENAME + "] "); + ret.append("\n"); +} + +void DBLiveFilesMetadataDumperCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status s; + + std::vector<ColumnFamilyMetaData> metadata; + db_->GetAllColumnFamilyMetaData(&metadata); + if (sort_by_filename_) { + std::cout << "Live SST and Blob Files:" << std::endl; + // tuple of <file path, level, column family name> + std::vector<std::tuple<std::string, int, std::string>> all_files; + + for (const auto& column_metadata : metadata) { + // Iterate Levels + const auto& levels = column_metadata.levels; + const std::string& cf = column_metadata.name; + for (const auto& level_metadata : levels) { + // Iterate SST files + const auto& sst_files = level_metadata.files; + int level = level_metadata.level; + for (const auto& sst_metadata : sst_files) { + // The SstFileMetaData.name always starts with "/", + // however SstFileMetaData.db_path is the string provided by + // the user as an input. Therefore we check if we can + // concantenate the two strings directly or if we need to + // drop a possible extra "/" at the end of SstFileMetaData.db_path. 
+ std::string filename = + NormalizePath(sst_metadata.db_path + "/" + sst_metadata.name); + all_files.emplace_back(filename, level, cf); + } // End of for-loop over sst files + } // End of for-loop over levels + + const auto& blob_files = column_metadata.blob_files; + for (const auto& blob_metadata : blob_files) { + // The BlobMetaData.blob_file_name always starts with "/", + // however BlobMetaData.blob_file_path is the string provided by + // the user as an input. Therefore we check if we can + // concantenate the two strings directly or if we need to + // drop a possible extra "/" at the end of BlobMetaData.blob_file_path. + std::string filename = NormalizePath( + blob_metadata.blob_file_path + "/" + blob_metadata.blob_file_name); + // Level for blob files is encoded as -1 + all_files.emplace_back(filename, -1, cf); + } // End of for-loop over blob files + } // End of for-loop over column metadata + + // Sort by filename (i.e. first entry in tuple) + std::sort(all_files.begin(), all_files.end()); + + for (const auto& item : all_files) { + const std::string& filename = std::get<0>(item); + int level = std::get<1>(item); + const std::string& cf = std::get<2>(item); + if (level == -1) { // Blob File + std::cout << filename << ", column family '" << cf << "'" << std::endl; + } else { // SST file + std::cout << filename << " : level " << level << ", column family '" + << cf << "'" << std::endl; + } + } + } else { + for (const auto& column_metadata : metadata) { + std::cout << "===== Column Family: " << column_metadata.name + << " =====" << std::endl; + + std::cout << "Live SST Files:" << std::endl; + // Iterate levels + const auto& levels = column_metadata.levels; + for (const auto& level_metadata : levels) { + std::cout << "---------- level " << level_metadata.level + << " ----------" << std::endl; + // Iterate SST files + const auto& sst_files = level_metadata.files; + for (const auto& sst_metadata : sst_files) { + // The SstFileMetaData.name always starts with 
"/", + // however SstFileMetaData.db_path is the string provided by + // the user as an input. Therefore we check if we can + // concantenate the two strings directly or if we need to + // drop a possible extra "/" at the end of SstFileMetaData.db_path. + std::string filename = + NormalizePath(sst_metadata.db_path + "/" + sst_metadata.name); + std::cout << filename << std::endl; + } // End of for-loop over sst files + } // End of for-loop over levels + + std::cout << "Live Blob Files:" << std::endl; + const auto& blob_files = column_metadata.blob_files; + for (const auto& blob_metadata : blob_files) { + // The BlobMetaData.blob_file_name always starts with "/", + // however BlobMetaData.blob_file_path is the string provided by + // the user as an input. Therefore we check if we can + // concantenate the two strings directly or if we need to + // drop a possible extra "/" at the end of BlobMetaData.blob_file_path. + std::string filename = NormalizePath( + blob_metadata.blob_file_path + "/" + blob_metadata.blob_file_name); + std::cout << filename << std::endl; + } // End of for-loop over blob files + } // End of for-loop over column metadata + } // End of else ("not sort_by_filename") + std::cout << "------------------------------" << std::endl; +} + +void WriteExternalSstFilesCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(WriteExternalSstFilesCommand::Name()); + ret.append(" <output_sst_path>"); + ret.append("\n"); +} + +WriteExternalSstFilesCommand::WriteExternalSstFilesCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, false /* is_read_only */, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, + ARG_TO, ARG_CREATE_IF_MISSING})) { + create_if_missing_ = + IsFlagPresent(flags, ARG_CREATE_IF_MISSING) || + ParseBooleanOption(options, ARG_CREATE_IF_MISSING, false); + if (params.size() != 1) { + 
exec_state_ = LDBCommandExecuteResult::Failed( + "output SST file path must be specified"); + } else { + output_sst_path_ = params.at(0); + } +} + +void WriteExternalSstFilesCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + ColumnFamilyHandle* cfh = GetCfHandle(); + SstFileWriter sst_file_writer(EnvOptions(), db_->GetOptions(), cfh); + Status status = sst_file_writer.Open(output_sst_path_); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed("failed to open SST file: " + + status.ToString()); + return; + } + + int bad_lines = 0; + std::string line; + std::ifstream ifs_stdin("/dev/stdin"); + std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin; + while (getline(*istream_p, line, '\n')) { + std::string key; + std::string value; + if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { + status = sst_file_writer.Put(key, value); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "failed to write record to file: " + status.ToString()); + return; + } + } else if (0 == line.find("Keys in range:")) { + // ignore this line + } else if (0 == line.find("Created bg thread 0x")) { + // ignore this line + } else { + bad_lines++; + } + } + + status = sst_file_writer.Finish(); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Failed to finish writing to file: " + status.ToString()); + return; + } + + if (bad_lines > 0) { + fprintf(stderr, "Warning: %d bad lines ignored.\n", bad_lines); + } + exec_state_ = LDBCommandExecuteResult::Succeed( + "external SST file written to " + output_sst_path_); +} + +void WriteExternalSstFilesCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + +const std::string IngestExternalSstFilesCommand::ARG_MOVE_FILES = "move_files"; +const std::string IngestExternalSstFilesCommand::ARG_SNAPSHOT_CONSISTENCY = + "snapshot_consistency"; +const 
std::string IngestExternalSstFilesCommand::ARG_ALLOW_GLOBAL_SEQNO = + "allow_global_seqno"; +const std::string IngestExternalSstFilesCommand::ARG_ALLOW_BLOCKING_FLUSH = + "allow_blocking_flush"; +const std::string IngestExternalSstFilesCommand::ARG_INGEST_BEHIND = + "ingest_behind"; +const std::string IngestExternalSstFilesCommand::ARG_WRITE_GLOBAL_SEQNO = + "write_global_seqno"; + +void IngestExternalSstFilesCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(IngestExternalSstFilesCommand::Name()); + ret.append(" <input_sst_path>"); + ret.append(" [--" + ARG_MOVE_FILES + "] "); + ret.append(" [--" + ARG_SNAPSHOT_CONSISTENCY + "] "); + ret.append(" [--" + ARG_ALLOW_GLOBAL_SEQNO + "] "); + ret.append(" [--" + ARG_ALLOW_BLOCKING_FLUSH + "] "); + ret.append(" [--" + ARG_INGEST_BEHIND + "] "); + ret.append(" [--" + ARG_WRITE_GLOBAL_SEQNO + "] "); + ret.append("\n"); +} + +IngestExternalSstFilesCommand::IngestExternalSstFilesCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand( + options, flags, false /* is_read_only */, + BuildCmdLineOptions({ARG_MOVE_FILES, ARG_SNAPSHOT_CONSISTENCY, + ARG_ALLOW_GLOBAL_SEQNO, ARG_CREATE_IF_MISSING, + ARG_ALLOW_BLOCKING_FLUSH, ARG_INGEST_BEHIND, + ARG_WRITE_GLOBAL_SEQNO})), + move_files_(false), + snapshot_consistency_(true), + allow_global_seqno_(true), + allow_blocking_flush_(true), + ingest_behind_(false), + write_global_seqno_(true) { + create_if_missing_ = + IsFlagPresent(flags, ARG_CREATE_IF_MISSING) || + ParseBooleanOption(options, ARG_CREATE_IF_MISSING, false); + move_files_ = IsFlagPresent(flags, ARG_MOVE_FILES) || + ParseBooleanOption(options, ARG_MOVE_FILES, false); + snapshot_consistency_ = + IsFlagPresent(flags, ARG_SNAPSHOT_CONSISTENCY) || + ParseBooleanOption(options, ARG_SNAPSHOT_CONSISTENCY, true); + allow_global_seqno_ = + IsFlagPresent(flags, ARG_ALLOW_GLOBAL_SEQNO) || + ParseBooleanOption(options, 
ARG_ALLOW_GLOBAL_SEQNO, true); + allow_blocking_flush_ = + IsFlagPresent(flags, ARG_ALLOW_BLOCKING_FLUSH) || + ParseBooleanOption(options, ARG_ALLOW_BLOCKING_FLUSH, true); + ingest_behind_ = IsFlagPresent(flags, ARG_INGEST_BEHIND) || + ParseBooleanOption(options, ARG_INGEST_BEHIND, false); + write_global_seqno_ = + IsFlagPresent(flags, ARG_WRITE_GLOBAL_SEQNO) || + ParseBooleanOption(options, ARG_WRITE_GLOBAL_SEQNO, true); + + if (allow_global_seqno_) { + if (!write_global_seqno_) { + fprintf(stderr, + "Warning: not writing global_seqno to the ingested SST can\n" + "prevent older versions of RocksDB from being able to open it\n"); + } + } else { + if (write_global_seqno_) { + exec_state_ = LDBCommandExecuteResult::Failed( + "ldb cannot write global_seqno to the ingested SST when global_seqno " + "is not allowed"); + } + } + + if (params.size() != 1) { + exec_state_ = + LDBCommandExecuteResult::Failed("input SST path must be specified"); + } else { + input_sst_path_ = params.at(0); + } +} + +void IngestExternalSstFilesCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + if (GetExecuteState().IsFailed()) { + return; + } + ColumnFamilyHandle* cfh = GetCfHandle(); + IngestExternalFileOptions ifo; + ifo.move_files = move_files_; + ifo.snapshot_consistency = snapshot_consistency_; + ifo.allow_global_seqno = allow_global_seqno_; + ifo.allow_blocking_flush = allow_blocking_flush_; + ifo.ingest_behind = ingest_behind_; + ifo.write_global_seqno = write_global_seqno_; + Status status = db_->IngestExternalFile(cfh, {input_sst_path_}, ifo); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "failed to ingest external SST: " + status.ToString()); + } else { + exec_state_ = + LDBCommandExecuteResult::Succeed("external SST files ingested"); + } +} + +void IngestExternalSstFilesCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + 
// ---- tail of ldb_cmd.cc (reformatted from diff fragment) ----

// "list_file_range_deletes": parse the optional --max_keys bound on how many
// range-deletion tombstones will be printed.
ListFileRangeDeletesCommand::ListFileRangeDeletesCommand(
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) {
  auto itr = options.find(ARG_MAX_KEYS);
  if (itr != options.end()) {
    try {
#if defined(CYGWIN)
      // NOTE(review): the CYGWIN branch uses strtol instead of std::stoi --
      // presumably a toolchain limitation; unlike std::stoi, strtol here
      // performs no error reporting.
      max_keys_ = strtol(itr->second.c_str(), 0, 10);
#else
      max_keys_ = std::stoi(itr->second);
#endif
    } catch (const std::invalid_argument&) {
      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
                                                    " has an invalid value");
    } catch (const std::out_of_range&) {
      exec_state_ = LDBCommandExecuteResult::Failed(
          ARG_MAX_KEYS + " has a value out-of-range");
    }
  }
}

void ListFileRangeDeletesCommand::Help(std::string& ret) {
  ret.append(" ");
  ret.append(ListFileRangeDeletesCommand::Name());
  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
  ret.append(" : print tombstones in SST files.\n");
}

void ListFileRangeDeletesCommand::DoCommand() {
  // If the DB failed to open, the constructor/base class already recorded a
  // failed execute state; nothing to do here.
  if (!db_) {
    assert(GetExecuteState().IsFailed());
    return;
  }

  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());

  std::string out_str;

  Status st =
      db_impl->TablesRangeTombstoneSummary(GetCfHandle(), max_keys_, &out_str);
  if (st.ok()) {
    TEST_SYNC_POINT_CALLBACK(
        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", &out_str);
    fprintf(stdout, "%s\n", out_str.c_str());
  }
}

void UnsafeRemoveSstFileCommand::Help(std::string& ret) {
  ret.append(" ");
  ret.append(UnsafeRemoveSstFileCommand::Name());
  ret.append(" <SST file number>");
  ret.append(" ");
  ret.append(" MUST NOT be used on a live DB.");
  ret.append("\n");
}

// "unsafe_remove_sst_file": requires exactly one positional argument, the
// SST file number, parsed as an unsigned decimal integer.
UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand(
    const std::vector<std::string>& params,
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(options, flags, false /* is_read_only */,
                 BuildCmdLineOptions({})) {
  if (params.size() != 1) {
    exec_state_ =
        LDBCommandExecuteResult::Failed("SST file number must be specified");
  } else {
    char* endptr = nullptr;
    sst_file_number_ = strtoull(params.at(0).c_str(), &endptr, 10 /* base */);
    // Reject any trailing non-digit garbage after the parsed number.
    if (endptr == nullptr || *endptr != '\0') {
      exec_state_ = LDBCommandExecuteResult::Failed(
          "Failed to parse SST file number " + params.at(0));
    }
  }
}

// Recovers the version state via OfflineManifestWriter, locates the file's
// column family and level, then logs a VersionEdit that deletes the file.
void UnsafeRemoveSstFileCommand::DoCommand() {
  PrepareOptions();

  OfflineManifestWriter w(options_, db_path_);
  if (column_families_.empty()) {
    // No column families specified on the command line: operate on default.
    column_families_.emplace_back(kDefaultColumnFamilyName, options_);
  }
  Status s = w.Recover(column_families_);

  ColumnFamilyData* cfd = nullptr;
  int level = -1;
  if (s.ok()) {
    FileMetaData* metadata = nullptr;
    s = w.Versions().GetMetadataForFile(sst_file_number_, &level, &metadata,
                                        &cfd);
  }

  if (s.ok()) {
    VersionEdit edit;
    edit.SetColumnFamily(cfd->GetID());
    edit.DeleteFile(level, sst_file_number_);
    std::unique_ptr<FSDirectory> db_dir;
    s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(),
                                                    &db_dir, nullptr);
    if (s.ok()) {
      s = w.LogAndApply(cfd, &edit, db_dir.get());
    }
  }

  if (!s.ok()) {
    exec_state_ = LDBCommandExecuteResult::Failed(
        "failed to unsafely remove SST file: " + s.ToString());
  } else {
    exec_state_ = LDBCommandExecuteResult::Succeed("unsafely removed SST file");
  }
}

const std::string UpdateManifestCommand::ARG_VERBOSE = "verbose";
const std::string UpdateManifestCommand::ARG_UPDATE_TEMPERATURES =
    "update_temperatures";

void UpdateManifestCommand::Help(std::string& ret) {
  ret.append(" ");
  ret.append(UpdateManifestCommand::Name());
  ret.append(" [--update_temperatures]");
  ret.append(" ");
  ret.append(" MUST NOT be used on a live DB.");
  ret.append("\n");
}

UpdateManifestCommand::UpdateManifestCommand(
    const std::vector<std::string>& /*params*/,
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(options, flags, false /* is_read_only */,
                 BuildCmdLineOptions({ARG_VERBOSE, ARG_UPDATE_TEMPERATURES})) {
  // Each option may be given either as a bare flag or as a boolean option.
  verbose_ = IsFlagPresent(flags, ARG_VERBOSE) ||
             ParseBooleanOption(options, ARG_VERBOSE, false);
  update_temperatures_ =
      IsFlagPresent(flags, ARG_UPDATE_TEMPERATURES) ||
      ParseBooleanOption(options, ARG_UPDATE_TEMPERATURES, false);

  // At least one action must be requested; --update_temperatures is
  // currently the only one.
  if (!update_temperatures_) {
    exec_state_ = LDBCommandExecuteResult::Failed(
        "No action like --update_temperatures specified for update_manifest");
  }
}

void UpdateManifestCommand::DoCommand() {
  PrepareOptions();

  // Log to stderr; --verbose raises the level from WARN to INFO.
  auto level = verbose_ ? InfoLogLevel::INFO_LEVEL : InfoLogLevel::WARN_LEVEL;
  options_.info_log.reset(new StderrLogger(level));

  experimental::UpdateManifestForFilesStateOptions opts;
  opts.update_temperatures = update_temperatures_;
  if (column_families_.empty()) {
    column_families_.emplace_back(kDefaultColumnFamilyName, options_);
  }
  // NOTE(review): `opts` is constructed above but not passed to the call
  // below in this fragment -- verify against upstream RocksDB; this may be
  // an artifact of the diff-page extraction.
  Status s = experimental::UpdateManifestForFilesState(options_, db_path_,
                                                       column_families_);

  if (!s.ok()) {
    exec_state_ = LDBCommandExecuteResult::Failed(
        "failed to update manifest: " + s.ToString());
  } else {
    exec_state_ =
        LDBCommandExecuteResult::Succeed("Manifest updates successful");
  }
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_cmd_impl.h b/src/rocksdb/tools/ldb_cmd_impl.h
new file mode 100644
index 000000000..97de981b1
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_impl.h
@@ -0,0 +1,744 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <map>
#include <string>
#include <utility>
#include <vector>

#include "rocksdb/utilities/ldb_cmd.h"

// Declarations of the concrete ldb subcommand classes. Each class exposes a
// static Name() returning the subcommand string, a static Help() appending
// usage text, and a DoCommand() override with the implementation.
namespace ROCKSDB_NAMESPACE {

class CompactorCommand : public LDBCommand {
 public:
  static std::string Name() { return "compact"; }

  CompactorCommand(const std::vector<std::string>& params,
                   const std::map<std::string, std::string>& options,
                   const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  // Optional key-range bounds; the null_* flags record whether each bound
  // was supplied.
  bool null_from_;
  std::string from_;
  bool null_to_;
  std::string to_;
};

class DBFileDumperCommand : public LDBCommand {
 public:
  static std::string Name() { return "dump_live_files"; }

  DBFileDumperCommand(const std::vector<std::string>& params,
                      const std::map<std::string, std::string>& options,
                      const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  bool decode_blob_index_;
  bool dump_uncompressed_blobs_;
};

class DBLiveFilesMetadataDumperCommand : public LDBCommand {
 public:
  static std::string Name() { return "list_live_files_metadata"; }

  DBLiveFilesMetadataDumperCommand(
      const std::vector<std::string>& params,
      const std::map<std::string, std::string>& options,
      const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  bool sort_by_filename_;

  static const std::string ARG_SORT_BY_FILENAME;
};

class DBDumperCommand : public LDBCommand {
 public:
  static std::string Name() { return "dump"; }

  DBDumperCommand(const std::vector<std::string>& params,
                  const std::map<std::string, std::string>& options,
                  const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  /**
   * Extract file name from the full path. We handle both the forward slash (/)
   * and backslash (\) to make sure that different OS-s are supported.
   */
  static std::string GetFileNameFromPath(const std::string& s) {
    std::size_t n = s.find_last_of("/\\");

    if (std::string::npos == n) {
      // No separator found: the input is already a bare file name.
      return s;
    } else {
      return s.substr(n + 1);
    }
  }

  void DoDumpCommand();

  bool null_from_;
  std::string from_;
  bool null_to_;
  std::string to_;
  int max_keys_;
  std::string delim_;
  bool count_only_;
  bool count_delim_;
  bool print_stats_;
  std::string path_;
  bool decode_blob_index_;
  bool dump_uncompressed_blobs_;

  static const std::string ARG_COUNT_ONLY;
  static const std::string ARG_COUNT_DELIM;
  static const std::string ARG_STATS;
  static const std::string ARG_TTL_BUCKET;
};

class InternalDumpCommand : public LDBCommand {
 public:
  static std::string Name() { return "idump"; }

  InternalDumpCommand(const std::vector<std::string>& params,
                      const std::map<std::string, std::string>& options,
                      const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  bool has_from_;
  std::string from_;
  bool has_to_;
  std::string to_;
  int max_keys_;
  std::string delim_;
  bool count_only_;
  bool count_delim_;
  bool print_stats_;
  bool is_input_key_hex_;
  bool decode_blob_index_;

  static const std::string ARG_DELIM;
  static const std::string ARG_COUNT_ONLY;
  static const std::string ARG_COUNT_DELIM;
  static const std::string ARG_STATS;
  static const std::string ARG_INPUT_KEY_HEX;
};

class DBLoaderCommand : public LDBCommand {
 public:
  static std::string Name() { return "load"; }

  DBLoaderCommand(std::string& db_name, std::vector<std::string>& args);

  DBLoaderCommand(const std::vector<std::string>& params,
                  const std::map<std::string, std::string>& options,
                  const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  void OverrideBaseOptions() override;

 private:
  bool disable_wal_;
  bool bulk_load_;
  bool compact_;

  static const std::string ARG_DISABLE_WAL;
  static const std::string ARG_BULK_LOAD;
  static const std::string ARG_COMPACT;
};

class ManifestDumpCommand : public LDBCommand {
 public:
  static std::string Name() { return "manifest_dump"; }

  ManifestDumpCommand(const std::vector<std::string>& params,
                      const std::map<std::string, std::string>& options,
                      const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  bool NoDBOpen() override { return true; }

 private:
  bool verbose_;
  bool json_;
  std::string path_;

  static const std::string ARG_VERBOSE;
  static const std::string ARG_JSON;
  static const std::string ARG_PATH;
};

class UpdateManifestCommand : public LDBCommand {
 public:
  static std::string Name() { return "update_manifest"; }

  UpdateManifestCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags);

  static void Help(std::string& ret);
  virtual void DoCommand() override;

  virtual bool NoDBOpen() override { return true; }

 private:
  bool verbose_;
  bool update_temperatures_;
  // TODO future: checksum_func for populating checksums

  static const std::string ARG_VERBOSE;
  static const std::string ARG_UPDATE_TEMPERATURES;
};

class FileChecksumDumpCommand : public LDBCommand {
 public:
  static std::string Name() { return "file_checksum_dump"; }

  FileChecksumDumpCommand(const std::vector<std::string>& params,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags);

  static void Help(std::string& ret);
  void DoCommand() override;

  bool NoDBOpen() override { return true; }

 private:
  std::string path_;
  bool is_checksum_hex_;

  static const std::string ARG_PATH;
};

class GetPropertyCommand : public LDBCommand {
 public:
  static std::string Name() { return "get_property"; }

  GetPropertyCommand(const std::vector<std::string>& params,
                     const std::map<std::string, std::string>& options,
                     const std::vector<std::string>& flags);

  static void Help(std::string& ret);
  void DoCommand() override;

 private:
  std::string property_;
};

class ListColumnFamiliesCommand : public LDBCommand {
 public:
  static std::string Name() { return "list_column_families"; }

  ListColumnFamiliesCommand(const std::vector<std::string>& params,
                            const std::map<std::string, std::string>& options,
                            const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  bool NoDBOpen() override { return true; }
};

class CreateColumnFamilyCommand : public LDBCommand {
 public:
  static std::string Name() { return "create_column_family"; }

  CreateColumnFamilyCommand(const std::vector<std::string>& params,
                            const std::map<std::string, std::string>& options,
                            const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  bool NoDBOpen() override { return false; }

 private:
  std::string new_cf_name_;
};

class DropColumnFamilyCommand : public LDBCommand {
 public:
  static std::string Name() { return "drop_column_family"; }

  DropColumnFamilyCommand(const std::vector<std::string>& params,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  bool NoDBOpen() override { return false; }

 private:
  std::string cf_name_to_drop_;
};

class ReduceDBLevelsCommand : public LDBCommand {
 public:
  static std::string Name() { return "reduce_levels"; }

  ReduceDBLevelsCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags);

  void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override;

  void DoCommand() override;

  bool NoDBOpen() override { return true; }

  static void Help(std::string& msg);

  static std::vector<std::string> PrepareArgs(const std::string& db_path,
                                              int new_levels,
                                              bool print_old_level = false);

 private:
  int old_levels_;
  int new_levels_;
  bool print_old_levels_;

  static const std::string ARG_NEW_LEVELS;
  static const std::string ARG_PRINT_OLD_LEVELS;

  Status GetOldNumOfLevels(Options& opt, int* levels);
};

class ChangeCompactionStyleCommand : public LDBCommand {
 public:
  static std::string Name() { return "change_compaction_style"; }

  ChangeCompactionStyleCommand(
      const std::vector<std::string>& params,
      const std::map<std::string, std::string>& options,
      const std::vector<std::string>& flags);

  void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override;

  void DoCommand() override;

  static void Help(std::string& msg);

 private:
  int old_compaction_style_;
  int new_compaction_style_;

  static const std::string ARG_OLD_COMPACTION_STYLE;
  static const std::string ARG_NEW_COMPACTION_STYLE;
};

class WALDumperCommand : public LDBCommand {
 public:
  static std::string Name() { return "dump_wal"; }

  WALDumperCommand(const std::vector<std::string>& params,
                   const std::map<std::string, std::string>& options,
                   const std::vector<std::string>& flags);

  bool NoDBOpen() override { return true; }

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  bool print_header_;
  std::string wal_file_;
  bool print_values_;
  bool is_write_committed_;  // default will be set to true

  static const std::string ARG_WAL_FILE;
  static const std::string ARG_WRITE_COMMITTED;
  static const std::string ARG_PRINT_HEADER;
  static const std::string ARG_PRINT_VALUE;
};

class GetCommand : public LDBCommand {
 public:
  static std::string Name() { return "get"; }

  GetCommand(const std::vector<std::string>& params,
             const std::map<std::string, std::string>& options,
             const std::vector<std::string>& flags);

  void DoCommand()
override;

  static void Help(std::string& ret);

 private:
  std::string key_;
};

class ApproxSizeCommand : public LDBCommand {
 public:
  static std::string Name() { return "approxsize"; }

  ApproxSizeCommand(const std::vector<std::string>& params,
                    const std::map<std::string, std::string>& options,
                    const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  std::string start_key_;
  std::string end_key_;
};

class BatchPutCommand : public LDBCommand {
 public:
  static std::string Name() { return "batchput"; }

  BatchPutCommand(const std::vector<std::string>& params,
                  const std::map<std::string, std::string>& options,
                  const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

  void OverrideBaseOptions() override;

 private:
  /**
   * The key-values to be inserted.
   */
  std::vector<std::pair<std::string, std::string>> key_values_;
};

class ScanCommand : public LDBCommand {
 public:
  static std::string Name() { return "scan"; }

  ScanCommand(const std::vector<std::string>& params,
              const std::map<std::string, std::string>& options,
              const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  std::string start_key_;
  std::string end_key_;
  // The *_specified_ flags record whether each range bound was given on the
  // command line.
  bool start_key_specified_;
  bool end_key_specified_;
  int max_keys_scanned_;
  bool no_value_;
};

class DeleteCommand : public LDBCommand {
 public:
  static std::string Name() { return "delete"; }

  DeleteCommand(const std::vector<std::string>& params,
                const std::map<std::string, std::string>& options,
                const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  std::string key_;
};

class SingleDeleteCommand : public LDBCommand {
 public:
  static std::string Name() { return "singledelete"; }

  SingleDeleteCommand(const std::vector<std::string>& params,
                      const std::map<std::string, std::string>& options,
                      const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  std::string key_;
};

class DeleteRangeCommand : public LDBCommand {
 public:
  static std::string Name() { return "deleterange"; }

  DeleteRangeCommand(const std::vector<std::string>& params,
                     const std::map<std::string, std::string>& options,
                     const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  std::string begin_key_;
  std::string end_key_;
};

class PutCommand : public LDBCommand {
 public:
  static std::string Name() { return "put"; }

  PutCommand(const std::vector<std::string>& params,
             const std::map<std::string, std::string>& options,
             const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

  void OverrideBaseOptions() override;

 private:
  std::string key_;
  std::string value_;
};

/**
 * Command that starts up a REPL shell that allows
 * get/put/delete.
 */
class DBQuerierCommand : public LDBCommand {
 public:
  static std::string Name() { return "query"; }

  DBQuerierCommand(const std::vector<std::string>& params,
                   const std::map<std::string, std::string>& options,
                   const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

 private:
  // Keywords recognized by the interactive shell.
  static const char* HELP_CMD;
  static const char* GET_CMD;
  static const char* PUT_CMD;
  static const char* DELETE_CMD;
};

class CheckConsistencyCommand : public LDBCommand {
 public:
  static std::string Name() { return "checkconsistency"; }

  CheckConsistencyCommand(const std::vector<std::string>& params,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags);

  void DoCommand() override;

  bool NoDBOpen() override { return true; }

  static void Help(std::string& ret);
};

class CheckPointCommand : public LDBCommand {
 public:
  static std::string Name() { return "checkpoint"; }

  CheckPointCommand(const std::vector<std::string>& params,
                    const std::map<std::string, std::string>& options,
                    const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

  std::string checkpoint_dir_;

 private:
  static const std::string ARG_CHECKPOINT_DIR;
};

class RepairCommand : public LDBCommand {
 public:
  static std::string Name() { return "repair"; }

  RepairCommand(const std::vector<std::string>& params,
                const std::map<std::string, std::string>& options,
                const std::vector<std::string>& flags);

  void DoCommand() override;

  bool NoDBOpen() override { return true; }

  void OverrideBaseOptions() override;

  static void Help(std::string& ret);

 protected:
  bool verbose_;

 private:
  static const std::string ARG_VERBOSE;
};

// Shared base for the backup/restore subcommands; holds the backup target
// location and threading/logging configuration parsed from the arguments.
class BackupEngineCommand : public LDBCommand {
 public:
  BackupEngineCommand(const std::vector<std::string>& params,
                      const std::map<std::string, std::string>& options,
                      const std::vector<std::string>& flags);

 protected:
  static void Help(const std::string& name, std::string& ret);
  std::string backup_env_uri_;
  std::string backup_fs_uri_;
  std::string backup_dir_;
  int num_threads_;
  std::unique_ptr<Logger> logger_;
  std::shared_ptr<Env> backup_env_guard_;

 private:
  static const std::string ARG_BACKUP_DIR;
  static const std::string ARG_BACKUP_ENV_URI;
  static const std::string ARG_BACKUP_FS_URI;
  static const std::string ARG_NUM_THREADS;
  static const std::string ARG_STDERR_LOG_LEVEL;
};

class BackupCommand : public BackupEngineCommand {
 public:
  static std::string Name() { return "backup"; }
  BackupCommand(const std::vector<std::string>& params,
                const std::map<std::string, std::string>& options,
                const std::vector<std::string>& flags);
  void DoCommand() override;
  static void Help(std::string& ret);
};

class RestoreCommand : public BackupEngineCommand {
 public:
  static std::string Name() { return "restore"; }
  RestoreCommand(const std::vector<std::string>& params,
                 const std::map<std::string, std::string>& options,
                 const std::vector<std::string>& flags);
  void DoCommand() override;
  bool NoDBOpen() override { return true; }
  static void Help(std::string& ret);
};

class WriteExternalSstFilesCommand : public LDBCommand {
 public:
  static std::string Name() { return "write_extern_sst"; }
  WriteExternalSstFilesCommand(
      const std::vector<std::string>& params,
      const std::map<std::string, std::string>& options,
      const std::vector<std::string>& flags);

  void DoCommand() override;

  bool NoDBOpen() override { return false; }

  void OverrideBaseOptions() override;

  static void Help(std::string& ret);

 private:
  std::string output_sst_path_;
};

class IngestExternalSstFilesCommand : public LDBCommand {
 public:
  static std::string Name() { return "ingest_extern_sst"; }
  IngestExternalSstFilesCommand(
      const std::vector<std::string>& params,
      const std::map<std::string, std::string>& options,
      const std::vector<std::string>& flags);

  void DoCommand() override;

  bool NoDBOpen() override { return false; }

  void OverrideBaseOptions() override;

  static void Help(std::string& ret);

 private:
  std::string input_sst_path_;
  bool move_files_;
  bool snapshot_consistency_;
  bool allow_global_seqno_;
  bool allow_blocking_flush_;
  bool ingest_behind_;
  bool write_global_seqno_;

  static const std::string ARG_MOVE_FILES;
  static const std::string ARG_SNAPSHOT_CONSISTENCY;
  static const std::string ARG_ALLOW_GLOBAL_SEQNO;
  static const std::string ARG_ALLOW_BLOCKING_FLUSH;
  static const std::string ARG_INGEST_BEHIND;
  static const std::string ARG_WRITE_GLOBAL_SEQNO;
};

// Command that prints out range delete tombstones in SST files.
class ListFileRangeDeletesCommand : public LDBCommand {
 public:
  static std::string Name() { return "list_file_range_deletes"; }

  ListFileRangeDeletesCommand(const std::map<std::string, std::string>& options,
                              const std::vector<std::string>& flags);

  void DoCommand() override;

  static void Help(std::string& ret);

 private:
  int max_keys_ = 1000;
};

// Command that removes the SST file forcibly from the manifest.
class UnsafeRemoveSstFileCommand : public LDBCommand {
 public:
  static std::string Name() { return "unsafe_remove_sst_file"; }

  UnsafeRemoveSstFileCommand(const std::vector<std::string>& params,
                             const std::map<std::string, std::string>& options,
                             const std::vector<std::string>& flags);

  static void Help(std::string& ret);

  void DoCommand() override;

  bool NoDBOpen() override { return true; }

 private:
  uint64_t sst_file_number_;
};

}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/ldb_cmd_test.cc b/src/rocksdb/tools/ldb_cmd_test.cc
new file mode 100644
index 000000000..5d83a6cd9
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_test.cc
@@ -0,0 +1,1226 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#ifndef ROCKSDB_LITE
#include "rocksdb/utilities/ldb_cmd.h"

#include <cinttypes>

#include "db/db_test_util.h"
#include "db/version_edit.h"
#include "db/version_set.h"
#include "env/composite_env_wrapper.h"
#include "file/filename.h"
#include "port/stack_trace.h"
#include "rocksdb/advanced_options.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/file_system.h"
#include "rocksdb/utilities/options_util.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/file_checksum_helper.h"
#include "util/random.h"

using std::map;
using std::string;
using std::vector;

namespace ROCKSDB_NAMESPACE {

class LdbCmdTest : public testing::Test {
 public:
  LdbCmdTest() : testing::Test() {}

  // Returns a custom Env loaded from the system configuration if one is
  // registered, otherwise Env::Default(); env_guard_ keeps it alive.
  Env* TryLoadCustomOrDefaultEnv() {
    Env* env = Env::Default();
    EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_));
    return env;
  }

 private:
  std::shared_ptr<Env> env_guard_;
};

TEST_F(LdbCmdTest, HelpAndVersion) {
  Options o;
  o.env = TryLoadCustomOrDefaultEnv();
  LDBOptions lo;
  static const char* help[] = {"./ldb", "--help"};
  ASSERT_EQ(0, LDBCommandRunner::RunCommand(2, help, o, lo, nullptr));
  static const char* version[] = {"./ldb", "--version"};
  ASSERT_EQ(0, LDBCommandRunner::RunCommand(2, version, o, lo, nullptr));
  static const char* bad[] = {"./ldb", "--not_an_option"};
  ASSERT_NE(0, LDBCommandRunner::RunCommand(2, bad, o, lo, nullptr));
}

TEST_F(LdbCmdTest, HexToString) {
  // map input to expected outputs.
  // odd number of "hex" half bytes doesn't make sense
  map<string, vector<int>> inputMap = {
      {"0x07", {7}},        {"0x5050", {80, 80}},         {"0xFF", {-1}},
      {"0x1234", {18, 52}}, {"0xaaAbAC", {-86, -85, -84}}, {"0x1203", {18, 3}},
  };

  for (const auto& inPair : inputMap) {
    auto actual = ROCKSDB_NAMESPACE::LDBCommand::HexToString(inPair.first);
    auto expected = inPair.second;
    // Compare byte-by-byte as signed values, then round-trip back to hex.
    for (unsigned int i = 0; i < actual.length(); i++) {
      EXPECT_EQ(expected[i], static_cast<int>((signed char)actual[i]));
    }
    auto reverse = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(actual);
    EXPECT_STRCASEEQ(inPair.first.c_str(), reverse.c_str());
  }
}

TEST_F(LdbCmdTest, HexToStringBadInputs) {
  const vector<string> badInputs = {
      "0xZZ", "123", "0xx5", "0x111G", "0x123", "Ox12", "0xT", "0x1Q1",
  };
  for (const auto& badInput : badInputs) {
    try {
      ROCKSDB_NAMESPACE::LDBCommand::HexToString(badInput);
      std::cerr << "Should fail on bad hex value: " << badInput << "\n";
      FAIL();
    } catch (...)
{
    }
  }
}

TEST_F(LdbCmdTest, MemEnv) {
  Env* base_env = TryLoadCustomOrDefaultEnv();
  std::unique_ptr<Env> env(NewMemEnv(base_env));
  Options opts;
  opts.env = env.get();
  opts.create_if_missing = true;

  DB* db = nullptr;
  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
  ASSERT_OK(DB::Open(opts, dbname, &db));

  WriteOptions wopts;
  for (int i = 0; i < 100; i++) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%08d", i);
    ASSERT_OK(db->Put(wopts, buf, buf));
  }
  FlushOptions fopts;
  fopts.wait = true;
  ASSERT_OK(db->Flush(fopts));

  delete db;

  // Run "ldb --db=<dbname> dump_live_files" against the in-memory DB.
  char arg1[] = "./ldb";
  char arg2[1024];
  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
  char arg3[] = "dump_live_files";
  char* argv[] = {arg1, arg2, arg3};

  ASSERT_EQ(0,
            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
}

// Helper that recomputes SST file checksums and compares them against what
// is stored in file metadata and in the MANIFEST.
class FileChecksumTestHelper {
 private:
  Options options_;
  DB* db_;
  std::string dbname_;

  // Recompute the checksum of one live file by streaming it in 2048-byte
  // chunks through the configured checksum generator, then compare with the
  // checksum recorded in the file's metadata.
  Status VerifyChecksum(LiveFileMetaData& file_meta) {
    std::string cur_checksum;
    std::string checksum_func_name;

    Status s;
    EnvOptions soptions;
    std::unique_ptr<SequentialFile> file_reader;
    std::string file_path = dbname_ + "/" + file_meta.name;
    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
    if (!s.ok()) {
      return s;
    }
    std::unique_ptr<char[]> scratch(new char[2048]);
    Slice result;
    FileChecksumGenFactory* file_checksum_gen_factory =
        options_.file_checksum_gen_factory.get();
    if (file_checksum_gen_factory == nullptr) {
      // No factory configured: the expected stored values are the "unknown"
      // sentinels.
      cur_checksum = kUnknownFileChecksum;
      checksum_func_name = kUnknownFileChecksumFuncName;
    } else {
      FileChecksumGenContext gen_context;
      gen_context.file_name = file_meta.name;
      std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
          file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
      checksum_func_name = file_checksum_gen->Name();
      s = file_reader->Read(2048, &result, scratch.get());
      if (!s.ok()) {
        return s;
      }
      while (result.size() != 0) {
        file_checksum_gen->Update(scratch.get(), result.size());
        s = file_reader->Read(2048, &result, scratch.get());
        if (!s.ok()) {
          return s;
        }
      }
      file_checksum_gen->Finalize();
      cur_checksum = file_checksum_gen->GetChecksum();
    }

    std::string stored_checksum = file_meta.file_checksum;
    std::string stored_checksum_func_name = file_meta.file_checksum_func_name;
    if ((cur_checksum != stored_checksum) ||
        (checksum_func_name != stored_checksum_func_name)) {
      return Status::Corruption(
          "Checksum does not match! The file: " + file_meta.name +
          ", checksum name: " + stored_checksum_func_name + " and checksum " +
          stored_checksum + ". However, expected checksum name: " +
          checksum_func_name + " and checksum " + cur_checksum);
    }
    return Status::OK();
  }

 public:
  FileChecksumTestHelper(Options& options, DB* db, std::string db_name)
      : options_(options), db_(db), dbname_(db_name) {}
  ~FileChecksumTestHelper() {}

  // Verify the checksum information in Manifest.
  Status VerifyChecksumInManifest(
      const std::vector<LiveFileMetaData>& live_files) {
    // Step 1: verify if the dbname_ is correct
    if (dbname_.back() != '/') {
      dbname_.append("/");
    }

    // Step 2, get the checksum information by recovering the VersionSet
    // from Manifest.
    std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
    EnvOptions sopt;
    std::shared_ptr<Cache> tc(NewLRUCache(options_.max_open_files - 10,
                                          options_.table_cache_numshardbits));
    options_.db_paths.emplace_back(dbname_, 0);
    options_.num_levels = 64;
    WriteController wc(options_.delayed_write_rate);
    WriteBufferManager wb(options_.db_write_buffer_size);
    ImmutableDBOptions immutable_db_options(options_);
    VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
                        &wc, nullptr, nullptr, "", "");
    std::vector<std::string> cf_name_list;
    Status s;
    s = versions.ListColumnFamilies(&cf_name_list, dbname_,
                                    immutable_db_options.fs.get());
    if (s.ok()) {
      std::vector<ColumnFamilyDescriptor> cf_list;
      for (const auto& name : cf_name_list) {
        fprintf(stdout, "cf_name: %s", name.c_str());
        cf_list.emplace_back(name, ColumnFamilyOptions(options_));
      }
      s = versions.Recover(cf_list, true);
    }
    if (s.ok()) {
      s = versions.GetLiveFilesChecksumInfo(checksum_list.get());
    }
    if (!s.ok()) {
      return s;
    }

    // Step 3 verify the checksum
    if (live_files.size() != checksum_list->size()) {
      return Status::Corruption("The number of files does not match!");
    }
    for (size_t i = 0; i < live_files.size(); i++) {
      std::string stored_checksum = "";
      std::string stored_func_name = "";
      s = checksum_list->SearchOneFileChecksum(
          live_files[i].file_number, &stored_checksum, &stored_func_name);
      if (s.IsNotFound()) {
        return s;
      }
      if (live_files[i].file_checksum != stored_checksum ||
          live_files[i].file_checksum_func_name != stored_func_name) {
        return Status::Corruption(
            "Checksum does not match! The file: " +
            std::to_string(live_files[i].file_number) +
            ". In Manifest, checksum name: " + stored_func_name +
            " and checksum " + stored_checksum +
            ". However, expected checksum name: " +
            live_files[i].file_checksum_func_name + " and checksum " +
            live_files[i].file_checksum);
      }
    }
    return Status::OK();
  }

  // Verify the checksum of each file by recalculating the checksum and
  // comparing it with the one being generated when a SST file is created.
  Status VerifyEachFileChecksum() {
    assert(db_ != nullptr);
    // Pin the current set of files while we read them.
    EXPECT_OK(db_->DisableFileDeletions());
    std::vector<LiveFileMetaData> live_files;
    db_->GetLiveFilesMetaData(&live_files);
    Status cs;
    for (auto a_file : live_files) {
      cs = VerifyChecksum(a_file);
      if (!cs.ok()) {
        break;
      }
    }
    EXPECT_OK(db_->EnableFileDeletions());
    return cs;
  }
};

TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
  Env* base_env = TryLoadCustomOrDefaultEnv();
  std::unique_ptr<Env> env(NewMemEnv(base_env));
  Options opts;
  opts.env = env.get();
  opts.create_if_missing = true;

  DB* db = nullptr;
  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
  ASSERT_OK(DB::Open(opts, dbname, &db));

  // Four overlapping key ranges, flushed separately, to produce several
  // SST files.
  WriteOptions wopts;
  FlushOptions fopts;
  fopts.wait = true;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 200; i++) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%08d", i);
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, buf, v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 100; i < 300; i++) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%08d", i);
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, buf, v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 200; i < 400; i++) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%08d", i);
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, buf, v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 300; i < 400; i++) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%08d", i);
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, buf, v));
  }
  ASSERT_OK(db->Flush(fopts));
  ASSERT_OK(db->Close());
  delete db;

  // Run "ldb --db=<dbname> file_checksum_dump --hex".
  char arg1[] = "./ldb";
  char arg2[1024];
  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
  char arg3[] = "file_checksum_dump";
  char arg4[] = "--hex";
  char* argv[] = {arg1, arg2, arg3, arg4};

  ASSERT_EQ(0,
            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));

  ASSERT_OK(DB::Open(opts, dbname, &db));

  // Verify each sst file checksum value and checksum name
  FileChecksumTestHelper fct_helper(opts, db, dbname);
  ASSERT_OK(fct_helper.VerifyEachFileChecksum());

  // Manually trigger compaction
  char b_buf[16];
  snprintf(b_buf, sizeof(b_buf), "%08d", 0);
  char e_buf[16];
  snprintf(e_buf, sizeof(e_buf), "%08d", 399);
  Slice begin(b_buf);
  Slice end(e_buf);
  CompactRangeOptions options;
  ASSERT_OK(db->CompactRange(options, &begin, &end));
  // Verify each sst file checksum after compaction
  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());

  ASSERT_OK(db->Close());
  delete db;

  ASSERT_EQ(0,
            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));

  ASSERT_OK(DB::Open(opts, dbname, &db));

  // Verify the checksum information in memory is the same as that in Manifest;
  std::vector<LiveFileMetaData> live_files;
  db->GetLiveFilesMetaData(&live_files);
  delete db;
  ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
}

TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
  Env* base_env = TryLoadCustomOrDefaultEnv();
  std::unique_ptr<Env> env(NewMemEnv(base_env));
  Options opts;
  opts.env = env.get();
  opts.create_if_missing = true;
  opts.enable_blob_files = true;

  DB* db = nullptr;
  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
  ASSERT_OK(DB::Open(opts, dbname, &db));

  // Same four overlapping ranges as above, but keys are built with an
  // ostringstream instead of snprintf.
  WriteOptions wopts;
  FlushOptions fopts;
  fopts.wait = true;
  Random rnd(test::RandomSeed());
  for (int i = 0; i < 200; i++) {
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, oss.str(), v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 100; i < 300; i++) {
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, oss.str(), v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 200; i < 400; i++) {
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, oss.str(), v));
  }
  ASSERT_OK(db->Flush(fopts));
  for (int i = 300; i < 400; i++) {
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
    std::string v = rnd.RandomString(100);
    ASSERT_OK(db->Put(wopts, oss.str(), v));
  }
  ASSERT_OK(db->Flush(fopts));
  ASSERT_OK(db->Close());
  delete db;

  char arg1[] = "./ldb";
  std::string arg2_str = "--db=" + dbname;
  char arg3[] = "file_checksum_dump";
  char arg4[] = "--hex";
  char* argv[] = {arg1, const_cast<char*>(arg2_str.c_str()), arg3, arg4};

  ASSERT_EQ(0,
            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));

  ASSERT_OK(DB::Open(opts, dbname, &db));

  // Verify each sst and blob file checksum value and checksum name
  FileChecksumTestHelper fct_helper(opts, db, dbname);
  ASSERT_OK(fct_helper.VerifyEachFileChecksum());

  // Manually trigger compaction
  std::ostringstream oss_b_buf;
  oss_b_buf << std::setfill('0') << std::setw(8) << std::fixed << 0;
  std::ostringstream oss_e_buf;
  oss_e_buf << std::setfill('0') << std::setw(8) << std::fixed << 399;
  std::string b_buf = oss_b_buf.str();
  std::string e_buf = oss_e_buf.str();
  Slice begin(b_buf);
  Slice end(e_buf);

  CompactRangeOptions options;
  ASSERT_OK(db->CompactRange(options, &begin, &end));
  // Verify each sst file checksum after compaction
  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
  // NOTE(review): the diff fragment is truncated here; the remainder of this
  // test continues beyond this chunk.
ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum()); + + ASSERT_OK(db->Close()); + delete db; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); +} + +TEST_F(LdbCmdTest, DumpFileChecksumCRC32) { + Env* base_env = TryLoadCustomOrDefaultEnv(); + std::unique_ptr<Env> env(NewMemEnv(base_env)); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + WriteOptions wopts; + FlushOptions fopts; + fopts.wait = true; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 100; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08d", i); + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, buf, v)); + } + ASSERT_OK(db->Flush(fopts)); + for (int i = 50; i < 150; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08d", i); + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, buf, v)); + } + ASSERT_OK(db->Flush(fopts)); + for (int i = 100; i < 200; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08d", i); + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, buf, v)); + } + ASSERT_OK(db->Flush(fopts)); + for (int i = 150; i < 250; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08d", i); + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, buf, v)); + } + ASSERT_OK(db->Flush(fopts)); + ASSERT_OK(db->Close()); + delete db; + + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "file_checksum_dump"; + char arg4[] = "--hex"; + char* argv[] = {arg1, arg2, arg3, arg4}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + ASSERT_OK(DB::Open(opts, dbname, &db)); + + // Verify each sst file checksum value and checksum name + FileChecksumTestHelper 
fct_helper(opts, db, dbname); + ASSERT_OK(fct_helper.VerifyEachFileChecksum()); + + // Manually trigger compaction + char b_buf[16]; + snprintf(b_buf, sizeof(b_buf), "%08d", 0); + char e_buf[16]; + snprintf(e_buf, sizeof(e_buf), "%08d", 249); + Slice begin(b_buf); + Slice end(e_buf); + CompactRangeOptions options; + ASSERT_OK(db->CompactRange(options, &begin, &end)); + // Verify each sst file checksum after compaction + FileChecksumTestHelper fct_helper_ac(opts, db, dbname); + ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum()); + + ASSERT_OK(db->Close()); + delete db; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + ASSERT_OK(DB::Open(opts, dbname, &db)); + + // Verify the checksum information in memory is the same as that in Manifest; + std::vector<LiveFileMetaData> live_files; + db->GetLiveFilesMetaData(&live_files); + ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files)); + + ASSERT_OK(db->Close()); + delete db; +} + +TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) { + Env* base_env = TryLoadCustomOrDefaultEnv(); + std::unique_ptr<Env> env(NewMemEnv(base_env)); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + opts.enable_blob_files = true; + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + WriteOptions wopts; + FlushOptions fopts; + fopts.wait = true; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 100; i++) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(8) << std::fixed << i; + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, oss.str(), v)); + } + ASSERT_OK(db->Flush(fopts)); + for (int i = 50; i < 150; i++) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(8) << std::fixed << i; + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, oss.str(), v)); + } 
+ ASSERT_OK(db->Flush(fopts)); + for (int i = 100; i < 200; i++) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(8) << std::fixed << i; + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, oss.str(), v)); + } + ASSERT_OK(db->Flush(fopts)); + for (int i = 150; i < 250; i++) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(8) << std::fixed << i; + std::string v = rnd.RandomString(100); + ASSERT_OK(db->Put(wopts, oss.str(), v)); + } + ASSERT_OK(db->Flush(fopts)); + ASSERT_OK(db->Close()); + delete db; + + char arg1[] = "./ldb"; + std::string arg2_str = "--db=" + dbname; + char arg3[] = "file_checksum_dump"; + char arg4[] = "--hex"; + char* argv[] = {arg1, const_cast<char*>(arg2_str.c_str()), arg3, arg4}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + ASSERT_OK(DB::Open(opts, dbname, &db)); + + // Verify each sst and blob file checksum value and checksum name + FileChecksumTestHelper fct_helper(opts, db, dbname); + ASSERT_OK(fct_helper.VerifyEachFileChecksum()); + + // Manually trigger compaction + std::ostringstream oss_b_buf; + oss_b_buf << std::setfill('0') << std::setw(8) << std::fixed << 0; + std::ostringstream oss_e_buf; + oss_e_buf << std::setfill('0') << std::setw(8) << std::fixed << 249; + std::string b_buf = oss_b_buf.str(); + std::string e_buf = oss_e_buf.str(); + Slice begin(b_buf); + Slice end(e_buf); + + CompactRangeOptions options; + ASSERT_OK(db->CompactRange(options, &begin, &end)); + // Verify each sst file checksum after compaction + FileChecksumTestHelper fct_helper_ac(opts, db, dbname); + ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum()); + + ASSERT_OK(db->Close()); + delete db; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); +} + +TEST_F(LdbCmdTest, OptionParsing) { + // test parsing flags + Options opts; + opts.env = TryLoadCustomOrDefaultEnv(); + { + std::vector<std::string> args; + args.push_back("scan"); + 
args.push_back("--ttl"); + args.push_back("--timestamp"); + LDBCommand* command = ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs( + args, opts, LDBOptions(), nullptr); + const std::vector<std::string> flags = command->TEST_GetFlags(); + EXPECT_EQ(flags.size(), 2); + EXPECT_EQ(flags[0], "ttl"); + EXPECT_EQ(flags[1], "timestamp"); + delete command; + } + // test parsing options which contains equal sign in the option value + { + std::vector<std::string> args; + args.push_back("scan"); + args.push_back("--db=/dev/shm/ldbtest/"); + args.push_back( + "--from='abcd/efg/hijk/lmn/" + "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz=a&bcd_" + "ef=gh.ijk'"); + LDBCommand* command = ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs( + args, opts, LDBOptions(), nullptr); + const std::map<std::string, std::string> option_map = + command->TEST_GetOptionMap(); + EXPECT_EQ(option_map.at("db"), "/dev/shm/ldbtest/"); + EXPECT_EQ(option_map.at("from"), + "'abcd/efg/hijk/lmn/" + "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz=" + "a&bcd_ef=gh.ijk'"); + delete command; + } +} + +TEST_F(LdbCmdTest, ListFileTombstone) { + Env* base_env = TryLoadCustomOrDefaultEnv(); + std::unique_ptr<Env> env(NewMemEnv(base_env)); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + WriteOptions wopts; + ASSERT_OK(db->Put(wopts, "foo", "1")); + ASSERT_OK(db->Put(wopts, "bar", "2")); + + FlushOptions fopts; + fopts.wait = true; + ASSERT_OK(db->Flush(fopts)); + + ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "foo", "foo2")); + ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2")); + ASSERT_OK(db->Flush(fopts)); + + delete db; + + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "list_file_range_deletes"; + 
char* argv[] = {arg1, arg2, arg3}; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { + std::string* out_str = reinterpret_cast<std::string*>(arg); + + // Count number of tombstones printed + int num_tb = 0; + const std::string kFingerprintStr = "start: "; + auto offset = out_str->find(kFingerprintStr); + while (offset != std::string::npos) { + num_tb++; + offset = + out_str->find(kFingerprintStr, offset + kFingerprintStr.size()); + } + EXPECT_EQ(2, num_tb); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + // Test the case of limiting tombstones + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "list_file_range_deletes"; + char arg4[] = "--max_keys=1"; + char* argv[] = {arg1, arg2, arg3, arg4}; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { + std::string* out_str = reinterpret_cast<std::string*>(arg); + + // Count number of tombstones printed + int num_tb = 0; + const std::string kFingerprintStr = "start: "; + auto offset = out_str->find(kFingerprintStr); + while (offset != std::string::npos) { + num_tb++; + offset = + out_str->find(kFingerprintStr, offset + kFingerprintStr.size()); + } + EXPECT_EQ(1, num_tb); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(LdbCmdTest, DisableConsistencyChecks) { + Env* 
base_env = TryLoadCustomOrDefaultEnv(); + std::unique_ptr<Env> env(NewMemEnv(base_env)); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + + { + DB* db = nullptr; + ASSERT_OK(DB::Open(opts, dbname, &db)); + + WriteOptions wopts; + FlushOptions fopts; + fopts.wait = true; + + ASSERT_OK(db->Put(wopts, "foo1", "1")); + ASSERT_OK(db->Put(wopts, "bar1", "2")); + ASSERT_OK(db->Flush(fopts)); + + ASSERT_OK(db->Put(wopts, "foo2", "3")); + ASSERT_OK(db->Put(wopts, "bar2", "4")); + ASSERT_OK(db->Flush(fopts)); + + delete db; + } + + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "checkconsistency"; + char* argv[] = {arg1, arg2, arg3}; + + SyncPoint::GetInstance()->SetCallBack( + "Version::PrepareAppend:forced_check", [&](void* arg) { + bool* forced = reinterpret_cast<bool*>(arg); + ASSERT_TRUE(*forced); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr)); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + } + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "scan"; + char* argv[] = {arg1, arg2, arg3}; + + SyncPoint::GetInstance()->SetCallBack( + "Version::PrepareAppend:forced_check", [&](void* arg) { + bool* forced = reinterpret_cast<bool*>(arg); + ASSERT_TRUE(*forced); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr)); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + } + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "scan"; + char arg4[] = "--disable_consistency_checks"; + char* 
argv[] = {arg1, arg2, arg3, arg4}; + + SyncPoint::GetInstance()->SetCallBack( + "ColumnFamilyData::ColumnFamilyData", [&](void* arg) { + ColumnFamilyOptions* cfo = + reinterpret_cast<ColumnFamilyOptions*>(arg); + ASSERT_FALSE(cfo->force_consistency_checks); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(LdbCmdTest, TestBadDbPath) { + Env* base_env = TryLoadCustomOrDefaultEnv(); + std::unique_ptr<Env> env(NewMemEnv(base_env)); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s/.no_such_dir", dbname.c_str()); + char arg3[1024]; + snprintf(arg3, sizeof(arg3), "create_column_family"); + char arg4[] = "bad cf"; + char* argv[] = {arg1, arg2, arg3, arg4}; + + ASSERT_EQ(1, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + snprintf(arg3, sizeof(arg3), "drop_column_family"); + ASSERT_EQ(1, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); +} +namespace { +class WrappedEnv : public EnvWrapper { + public: + explicit WrappedEnv(Env* t) : EnvWrapper(t) {} + static const char* kClassName() { return "WrappedEnv"; } + const char* Name() const override { return kClassName(); } +}; +} // namespace +TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) { + // Env* base_env = TryLoadCustomOrDefaultEnv(); + // std::unique_ptr<Env> env(NewMemEnv(base_env)); + std::unique_ptr<Env> env(new WrappedEnv(Env::Default())); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + ASSERT_OK(DestroyDB(dbname, opts)); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + 
ColumnFamilyHandle* cf_handle; + ColumnFamilyOptions cf_opts; + cf_opts.num_levels = 20; + ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle)); + + delete cf_handle; + delete db; + + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "put"; + char arg4[] = "key1"; + char arg5[] = "value1"; + char arg6[] = "--try_load_options"; + char arg7[] = "--column_family=cf1"; + char arg8[] = "--write_buffer_size=268435456"; + char* argv[] = {arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(8, argv, opts, LDBOptions(), nullptr)); + + ConfigOptions config_opts; + Options options; + std::vector<ColumnFamilyDescriptor> column_families; + config_opts.env = env.get(); + ASSERT_OK(LoadLatestOptions(config_opts, dbname, &options, &column_families)); + ASSERT_EQ(column_families.size(), 2); + ASSERT_EQ(options.num_levels, opts.num_levels); + ASSERT_EQ(column_families[1].options.num_levels, cf_opts.num_levels); + ASSERT_EQ(column_families[1].options.write_buffer_size, 268435456); +} + +TEST_F(LdbCmdTest, UnsafeRemoveSstFile) { + Options opts; + opts.level0_file_num_compaction_trigger = 10; + opts.create_if_missing = true; + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(Env::Default(), "ldb_cmd_test"); + ASSERT_OK(DestroyDB(dbname, opts)); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + // Create three SST files + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::to_string(i))); + ASSERT_OK(db->Flush(FlushOptions())); + } + + // Determine which is the "middle" one + std::vector<LiveFileMetaData> sst_files; + db->GetLiveFilesMetaData(&sst_files); + + std::vector<uint64_t> numbers; + for (auto& f : sst_files) { + numbers.push_back(f.file_number); + } + ASSERT_EQ(numbers.size(), 3); + std::sort(numbers.begin(), numbers.end()); + uint64_t to_remove = numbers[1]; + + // Close for unsafe_remove_sst_file + delete 
db; + db = nullptr; + + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "unsafe_remove_sst_file"; + char arg4[20]; + snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove); + char* argv[] = {arg1, arg2, arg3, arg4}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + // Re-open, and verify with Get that middle file is gone + ASSERT_OK(DB::Open(opts, dbname, &db)); + + std::string val; + ASSERT_OK(db->Get(ReadOptions(), "0", &val)); + ASSERT_EQ(val, "0"); + + ASSERT_OK(db->Get(ReadOptions(), "2", &val)); + ASSERT_EQ(val, "2"); + + ASSERT_TRUE(db->Get(ReadOptions(), "1", &val).IsNotFound()); + + // Now with extra CF, two more files + ColumnFamilyHandle* cf_handle; + ColumnFamilyOptions cf_opts; + ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle)); + for (size_t i = 3; i < 5; ++i) { + ASSERT_OK(db->Put(WriteOptions(), cf_handle, std::to_string(i), + std::to_string(i))); + ASSERT_OK(db->Flush(FlushOptions(), cf_handle)); + } + + // Determine which is the "last" one + sst_files.clear(); + db->GetLiveFilesMetaData(&sst_files); + + numbers.clear(); + for (auto& f : sst_files) { + numbers.push_back(f.file_number); + } + ASSERT_EQ(numbers.size(), 4); + std::sort(numbers.begin(), numbers.end()); + to_remove = numbers.back(); + + // Close for unsafe_remove_sst_file + delete cf_handle; + delete db; + db = nullptr; + + snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove); + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + std::vector<ColumnFamilyDescriptor> cfds = {{kDefaultColumnFamilyName, opts}, + {"cf1", cf_opts}}; + std::vector<ColumnFamilyHandle*> handles; + ASSERT_OK(DB::Open(opts, dbname, cfds, &handles, &db)); + + ASSERT_OK(db->Get(ReadOptions(), handles[1], "3", &val)); + ASSERT_EQ(val, "3"); + + ASSERT_TRUE(db->Get(ReadOptions(), handles[1], "4", &val).IsNotFound()); + + ASSERT_OK(db->Get(ReadOptions(), handles[0], 
"0", &val)); + ASSERT_EQ(val, "0"); + + // Determine which is the "first" one (most likely to be opened in recovery) + sst_files.clear(); + db->GetLiveFilesMetaData(&sst_files); + + numbers.clear(); + for (auto& f : sst_files) { + numbers.push_back(f.file_number); + } + ASSERT_EQ(numbers.size(), 3); + std::sort(numbers.begin(), numbers.end()); + to_remove = numbers.front(); + + // This time physically delete the file before unsafe_remove + { + std::string f = dbname + "/" + MakeTableFileName(to_remove); + ASSERT_OK(Env::Default()->DeleteFile(f)); + } + + // Close for unsafe_remove_sst_file + for (auto& h : handles) { + delete h; + } + delete db; + db = nullptr; + + snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove); + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + ASSERT_OK(DB::Open(opts, dbname, cfds, &handles, &db)); + + ASSERT_OK(db->Get(ReadOptions(), handles[1], "3", &val)); + ASSERT_EQ(val, "3"); + + ASSERT_TRUE(db->Get(ReadOptions(), handles[0], "0", &val).IsNotFound()); + + for (auto& h : handles) { + delete h; + } + delete db; +} + +TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) { + auto test_fs = std::make_shared<FileTemperatureTestFS>(FileSystem::Default()); + std::unique_ptr<Env> env(new CompositeEnvWrapper(Env::Default(), test_fs)); + Options opts; + opts.bottommost_temperature = Temperature::kWarm; + opts.level0_file_num_compaction_trigger = 10; + opts.create_if_missing = true; + opts.env = env.get(); + + DB* db = nullptr; + std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test"); + ASSERT_OK(DestroyDB(dbname, opts)); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + std::array<Temperature, 5> kTestTemps = { + Temperature::kCold, Temperature::kWarm, Temperature::kHot, + Temperature::kWarm, Temperature::kCold}; + std::map<uint64_t, Temperature> number_to_temp; + for (size_t i = 0; i < kTestTemps.size(); ++i) { + ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::to_string(i))); + 
ASSERT_OK(db->Flush(FlushOptions())); + + std::map<uint64_t, Temperature> current_temps; + test_fs->CopyCurrentSstFileTemperatures(¤t_temps); + for (auto e : current_temps) { + if (e.second == Temperature::kUnknown) { + test_fs->OverrideSstFileTemperature(e.first, kTestTemps[i]); + number_to_temp[e.first] = kTestTemps[i]; + } + } + } + + // Close & reopen + delete db; + db = nullptr; + test_fs->PopRequestedSstFileTemperatures(); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + for (size_t i = 0; i < kTestTemps.size(); ++i) { + std::string val; + ASSERT_OK(db->Get(ReadOptions(), std::to_string(i), &val)); + ASSERT_EQ(val, std::to_string(i)); + } + + // Still all unknown + std::vector<std::pair<uint64_t, Temperature>> requests; + test_fs->PopRequestedSstFileTemperatures(&requests); + ASSERT_EQ(requests.size(), kTestTemps.size()); + for (auto& r : requests) { + ASSERT_EQ(r.second, Temperature::kUnknown); + } + + // Close for update_manifest + delete db; + db = nullptr; + + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "update_manifest"; + char arg4[] = "--update_temperatures"; + char* argv[] = {arg1, arg2, arg3, arg4}; + + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + // Re-open, get, and verify manifest temps (based on request) + test_fs->PopRequestedSstFileTemperatures(); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + for (size_t i = 0; i < kTestTemps.size(); ++i) { + std::string val; + ASSERT_OK(db->Get(ReadOptions(), std::to_string(i), &val)); + ASSERT_EQ(val, std::to_string(i)); + } + + requests.clear(); + test_fs->PopRequestedSstFileTemperatures(&requests); + ASSERT_EQ(requests.size(), kTestTemps.size()); + for (auto& r : requests) { + ASSERT_EQ(r.second, number_to_temp[r.first]); + } + delete db; +} + +TEST_F(LdbCmdTest, RenameDbAndLoadOptions) { + Env* env = TryLoadCustomOrDefaultEnv(); + Options opts; + opts.env = env; + opts.create_if_missing = false; 
+ + std::string old_dbname = test::PerThreadDBPath(env, "ldb_cmd_test"); + std::string new_dbname = old_dbname + "_2"; + ASSERT_OK(DestroyDB(old_dbname, opts)); + ASSERT_OK(DestroyDB(new_dbname, opts)); + + char old_arg[1024]; + snprintf(old_arg, sizeof(old_arg), "--db=%s", old_dbname.c_str()); + char new_arg[1024]; + snprintf(new_arg, sizeof(old_arg), "--db=%s", new_dbname.c_str()); + const char* argv1[] = {"./ldb", + old_arg, + "put", + "key1", + "value1", + "--try_load_options", + "--create_if_missing"}; + + const char* argv2[] = {"./ldb", old_arg, "get", "key1", "--try_load_options"}; + const char* argv3[] = {"./ldb", new_arg, "put", + "key2", "value2", "--try_load_options"}; + + const char* argv4[] = {"./ldb", new_arg, "get", "key1", "--try_load_options"}; + const char* argv5[] = {"./ldb", new_arg, "get", "key2", "--try_load_options"}; + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(7, argv1, opts, LDBOptions(), nullptr)); + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(5, argv2, opts, LDBOptions(), nullptr)); + ConfigOptions config_opts; + Options options; + std::vector<ColumnFamilyDescriptor> column_families; + config_opts.env = env; + ASSERT_OK( + LoadLatestOptions(config_opts, old_dbname, &options, &column_families)); + ASSERT_EQ(options.wal_dir, ""); + + ASSERT_OK(env->RenameFile(old_dbname, new_dbname)); + ASSERT_NE( + 0, LDBCommandRunner::RunCommand(6, argv1, opts, LDBOptions(), nullptr)); + ASSERT_NE( + 0, LDBCommandRunner::RunCommand(5, argv2, opts, LDBOptions(), nullptr)); + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(6, argv3, opts, LDBOptions(), nullptr)); + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(5, argv4, opts, LDBOptions(), nullptr)); + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(5, argv5, opts, LDBOptions(), nullptr)); + ASSERT_OK(DestroyDB(new_dbname, opts)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + 
RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/ldb_test.py b/src/rocksdb/tools/ldb_test.py new file mode 100644 index 000000000..e243d69c0 --- /dev/null +++ b/src/rocksdb/tools/ldb_test.py @@ -0,0 +1,955 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob + +import os +import os.path +import re +import shutil +import subprocess +import tempfile +import time +import unittest + + +def my_check_output(*popenargs, **kwargs): + """ + If we had python 2.7, we should simply use subprocess.check_output. + This is a stop-gap solution for python 2.6 + """ + if "stdout" in kwargs: + raise ValueError("stdout argument not allowed, it will be overridden.") + process = subprocess.Popen( + stderr=subprocess.PIPE, stdout=subprocess.PIPE, *popenargs, **kwargs + ) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise Exception("Exit code is not 0. It is %d. 
Command: %s" % (retcode, cmd)) + return output.decode("utf-8") + + +def run_err_null(cmd): + return os.system(cmd + " 2>/dev/null ") + + +class LDBTestCase(unittest.TestCase): + def setUp(self): + self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_") + self.DB_NAME = "testdb" + + def tearDown(self): + assert ( + self.TMP_DIR.strip() != "/" + and self.TMP_DIR.strip() != "/tmp" + and self.TMP_DIR.strip() != "/tmp/" + ) # Just some paranoia + + shutil.rmtree(self.TMP_DIR) + + def dbParam(self, dbName): + return "--db=%s" % os.path.join(self.TMP_DIR, dbName) + + def assertRunOKFull( + self, params, expectedOutput, unexpected=False, isPattern=False + ): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + """ + output = my_check_output( + './ldb %s |grep -v "Created bg thread"' % params, shell=True + ) + if not unexpected: + if isPattern: + self.assertNotEqual(expectedOutput.search(output.strip()), None) + else: + self.assertEqual(output.strip(), expectedOutput.strip()) + else: + if isPattern: + self.assertEqual(expectedOutput.search(output.strip()), None) + else: + self.assertNotEqual(output.strip(), expectedOutput.strip()) + + def assertRunFAILFull(self, params): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + """ + try: + + my_check_output( + './ldb %s >/dev/null 2>&1 |grep -v "Created bg \ + thread"' + % params, + shell=True, + ) + except Exception: + return + self.fail( + "Exception should have been raised for command with params: %s" % params + ) + + def assertRunOK(self, params, expectedOutput, unexpected=False): + """ + Uses the default test db. + """ + self.assertRunOKFull( + "%s %s" % (self.dbParam(self.DB_NAME), params), expectedOutput, unexpected + ) + + def assertRunFAIL(self, params): + """ + Uses the default test db. 
+ """ + self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params)) + + def testSimpleStringPutGet(self): + print("Running testSimpleStringPutGet...") + self.assertRunFAIL("put x1 y1") + self.assertRunOK("put --create_if_missing x1 y1", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunFAIL("get x2") + + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2") + self.assertRunOK("put x3 y3", "OK") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("scan --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=2", "x1 : y1\nx2 : y2") + + self.assertRunOK( + "scan --from=x1 --to=z --max_keys=3", "x1 : y1\nx2 : y2\nx3 : y3" + ) + self.assertRunOK( + "scan --from=x1 --to=z --max_keys=4", "x1 : y1\nx2 : y2\nx3 : y3" + ) + self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3") + self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL + self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo") + + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("delete x1", "OK") + self.assertRunOK("scan", "x2 : y2\nx3 : y3") + + self.assertRunOK("delete NonExistentKey", "OK") + # It is weird that GET and SCAN raise exception for + # non-existent key, while delete does not + + self.assertRunOK("checkconsistency", "OK") + + def dumpDb(self, params, dumpFile): + return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile)) + + def loadDb(self, params, dumpFile): + return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params)) + + def writeExternSst(self, params, 
inputDumpFile, outputSst): + return 0 == run_err_null( + "cat %s | ./ldb write_extern_sst %s %s" % (inputDumpFile, outputSst, params) + ) + + def ingestExternSst(self, params, inputSst): + return 0 == run_err_null("./ldb ingest_extern_sst %s %s" % (inputSst, params)) + + def testStringBatchPut(self): + print("Running testStringBatchPut...") + self.assertRunOK("batchput x1 y1 --create_if_missing", "OK") + self.assertRunOK("scan", "x1 : y1") + self.assertRunOK('batchput x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunFAIL("batchput") + self.assertRunFAIL("batchput k1") + self.assertRunFAIL("batchput k1 v1 k2") + + def testBlobBatchPut(self): + print("Running testBlobBatchPut...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("batchput x1 y1 --create_if_missing --enable_blob_files", "OK") + self.assertRunOK("scan", "x1 : y1") + self.assertRunOK( + 'batchput --enable_blob_files x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK" + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + + blob_files = self.getBlobFiles(dbPath) + self.assertTrue(len(blob_files) >= 1) + + def testBlobPut(self): + print("Running testBlobPut...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put --create_if_missing --enable_blob_files x1 y1", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("put --enable_blob_files x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + blob_files = self.getBlobFiles(dbPath) + self.assertTrue(len(blob_files) >= 1) + + def testBlobStartingLevel(self): + print("Running testBlobStartingLevel...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK( + "put --create_if_missing --enable_blob_files --blob_file_starting_level=10 x1 y1", + "OK", + ) + self.assertRunOK("get x1", "y1") + + blob_files = self.getBlobFiles(dbPath) + 
self.assertTrue(len(blob_files) == 0) + + self.assertRunOK( + "put --enable_blob_files --blob_file_starting_level=0 x2 y2", "OK" + ) + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + blob_files = self.getBlobFiles(dbPath) + self.assertTrue(len(blob_files) >= 1) + + def testCountDelimDump(self): + print("Running testCountDelimDump...") + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK( + "dump --count_delim", + "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8", + ) + self.assertRunOK( + 'dump --count_delim="."', + "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8", + ) + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK( + 'dump --count_delim=","', + "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8", + ) + + def testCountDelimIDump(self): + print("Running testCountDelimIDump...") + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK( + "idump --count_delim", + "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8", + ) + self.assertRunOK( + 'idump --count_delim="."', + "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8", + ) + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK( + 'idump --count_delim=","', + "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8", + ) + + def testInvalidCmdLines(self): + print("Running testInvalidCmdLines...") + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + 
self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + + def testHexPutGet(self): + print("Running testHexPutGet...") + self.assertRunOK("put a1 b1 --create_if_missing", "OK") + self.assertRunOK("scan", "a1 : b1") + self.assertRunOK("scan --hex", "0x6131 : 0x6231") + self.assertRunFAIL("put --hex 6132 6232") + self.assertRunOK("put --hex 0x6132 0x6232", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan", "a1 : b1\na2 : b2") + self.assertRunOK("get a1", "b1") + self.assertRunOK("get --hex 0x6131", "0x6231") + self.assertRunOK("get a2", "b2") + self.assertRunOK("get --hex 0x6132", "0x6232") + self.assertRunOK("get --key_hex 0x6132", "b2") + self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") + self.assertRunOK("get --value_hex a2", "0x6232") + self.assertRunOK( + "scan --key_hex --value_hex", "0x6131 : 0x6231\n0x6132 : 0x6232" + ) + self.assertRunOK( + "scan --hex --from=0x6131 --to=0x6133", "0x6131 : 0x6231\n0x6132 : 0x6232" + ) + self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", "0x6131 : 0x6231") + self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2") + self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232") + self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4") + self.assertRunOK("delete --hex 0x6133", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4") + self.assertRunOK("checkconsistency", "OK") + + def testTtlPutGet(self): + print("Running testTtlPutGet...") + self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231", True) + self.assertRunOK("dump --ttl ", "a1 ==> b1", True) + self.assertRunOK("dump --hex --ttl ", "0x6131 ==> 0x6231\nKeys in range: 1") + self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231") + self.assertRunOK("get --value_hex a1", "0x6231", True) + 
self.assertRunOK("get --ttl a1", "b1") + self.assertRunOK("put a3 b3 --create_if_missing", "OK") + # fails because timstamp's length is greater than value's + self.assertRunFAIL("get --ttl a3") + self.assertRunOK("checkconsistency", "OK") + + def testInvalidCmdLines(self): # noqa: F811 T25377293 Grandfathered in + print("Running testInvalidCmdLines...") + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing") + + def testDumpLoad(self): + print("Running testDumpLoad...") + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + # Dump and load without any additional params specified + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue( + self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) + ) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + + # Dump and load in hex + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2") + self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath)) + self.assertTrue( + self.loadDb( + "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath + ) + ) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + + # Dump only a portion of the key range + dumpFilePath = os.path.join(self.TMP_DIR, "dump3") + 
loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3") + self.assertTrue( + self.dumpDb("--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath) + ) + self.assertTrue( + self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) + ) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2") + + # Dump upto max_keys rows + dumpFilePath = os.path.join(self.TMP_DIR, "dump4") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4") + self.assertTrue(self.dumpDb("--db=%s --max_keys=3" % origDbPath, dumpFilePath)) + self.assertTrue( + self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) + ) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3") + + # Load into an existing db, create_if_missing is not specified + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + + # Dump and load with WAL disabled + dumpFilePath = os.path.join(self.TMP_DIR, "dump5") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue( + self.loadDb( + "--db=%s --disable_wal --create_if_missing" % loadedDbPath, dumpFilePath + ) + ) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + + # Dump and load with lots of extra params specified + extraParams = " ".join( + [ + "--bloom_bits=14", + "--block_size=1024", + "--auto_compaction=true", + "--write_buffer_size=4194304", + "--file_size=2097152", + ] + ) + dumpFilePath = os.path.join(self.TMP_DIR, "dump6") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6") + self.assertTrue( + self.dumpDb("--db=%s %s" % (origDbPath, extraParams), dumpFilePath) + ) + self.assertTrue( + self.loadDb( + "--db=%s %s --create_if_missing" % 
(loadedDbPath, extraParams), + dumpFilePath, + ) + ) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + + # Dump with count_only + dumpFilePath = os.path.join(self.TMP_DIR, "dump7") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7") + self.assertTrue(self.dumpDb("--db=%s --count_only" % origDbPath, dumpFilePath)) + self.assertTrue( + self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) + ) + # DB should have atleast one value for scan to work + self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK") + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1") + + # Dump command fails because of typo in params + dumpFilePath = os.path.join(self.TMP_DIR, "dump8") + self.assertFalse( + self.dumpDb("--db=%s --create_if_missing" % origDbPath, dumpFilePath) + ) + + # Dump and load with BlobDB enabled + blobParams = " ".join( + ["--enable_blob_files", "--min_blob_size=1", "--blob_file_size=2097152"] + ) + dumpFilePath = os.path.join(self.TMP_DIR, "dump9") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump9") + self.assertTrue(self.dumpDb("--db=%s" % (origDbPath), dumpFilePath)) + self.assertTrue( + self.loadDb( + "--db=%s %s --create_if_missing --disable_wal" + % (loadedDbPath, blobParams), + dumpFilePath, + ) + ) + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + ) + blob_files = self.getBlobFiles(loadedDbPath) + self.assertTrue(len(blob_files) >= 1) + + def testIDumpBasics(self): + print("Running testIDumpBasics...") + self.assertRunOK("put a val --create_if_missing", "OK") + self.assertRunOK("put b val", "OK") + self.assertRunOK( + "idump", + "'a' seq:1, type:1 => val\n" + "'b' seq:2, type:1 => val\nInternal keys in range: 2", + ) + self.assertRunOK( + "idump --input_key_hex --from=%s --to=%s" % (hex(ord("a")), hex(ord("b"))), + "'a' seq:1, type:1 => val\nInternal keys in range: 1", + ) + + def 
testIDumpDecodeBlobIndex(self): + print("Running testIDumpDecodeBlobIndex...") + self.assertRunOK("put a val --create_if_missing", "OK") + self.assertRunOK("put b val --enable_blob_files", "OK") + + # Pattern to expect from dump with decode_blob_index flag enabled. + regex = ".*\[blob ref\].*" + expected_pattern = re.compile(regex) + cmd = "idump %s --decode_blob_index" + self.assertRunOKFull( + (cmd) % (self.dbParam(self.DB_NAME)), + expected_pattern, + unexpected=False, + isPattern=True, + ) + + def testMiscAdminTask(self): + print("Running testMiscAdminTask...") + # These tests need to be improved; for example with asserts about + # whether compaction or level reduction actually took place. + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + self.assertTrue(0 == run_err_null("./ldb compact --db=%s" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue( + 0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=2" % origDbPath) + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue( + 0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=3" % origDbPath) + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue( + 0 == run_err_null("./ldb compact --db=%s --from=x1 --to=x3" % origDbPath) + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue( + 0 + == run_err_null( + "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" % origDbPath + ) + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # TODO(dilip): Not sure what should be passed to WAL.Currently corrupted. 
+ self.assertTrue( + 0 + == run_err_null( + "./ldb dump_wal --db=%s --walfile=%s --header" + % (origDbPath, os.path.join(origDbPath, "LOG")) + ) + ) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + def testCheckConsistency(self): + print("Running testCheckConsistency...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("checkconsistency", "OK") + + sstFilePath = my_check_output( + "ls %s" % os.path.join(dbPath, "*.sst"), shell=True + ) + + # Modify the file + my_check_output("echo 'evil' > %s" % sstFilePath, shell=True) + self.assertRunFAIL("checkconsistency") + + # Delete the file + my_check_output("rm -f %s" % sstFilePath, shell=True) + self.assertRunFAIL("checkconsistency") + + def dumpLiveFiles(self, params, dumpFile): + return 0 == run_err_null("./ldb dump_live_files %s > %s" % (params, dumpFile)) + + def testDumpLiveFiles(self): + print("Running testDumpLiveFiles...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2 --enable_blob_files", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath)) + self.assertRunOK("delete x1", "OK") + self.assertRunOK("put x3 y3", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + + # Test that if the user provides a db path that ends with + # a slash '/', there is no double (or more!) slashes in the + # SST and manifest file names. + + # Add a '/' at the end of dbPath (which normally shouldnt contain any) + if dbPath[-1] != "/": + dbPath += "/" + + # Call the dump_live_files function with the edited dbPath name. 
+ self.assertTrue( + self.dumpLiveFiles( + "--db=%s --decode_blob_index --dump_uncompressed_blobs" % dbPath, + dumpFilePath, + ) + ) + + # Investigate the output + with open(dumpFilePath, "r") as tmp: + data = tmp.read() + + # Check that all the SST filenames have a correct full path (no multiple '/'). + sstFileList = re.findall(r"%s.*\d+.sst" % dbPath, data) + self.assertTrue(len(sstFileList) >= 1) + for sstFilename in sstFileList: + filenumber = re.findall(r"\d+.sst", sstFilename)[0] + self.assertEqual(sstFilename, dbPath + filenumber) + + # Check that all the Blob filenames have a correct full path (no multiple '/'). + blobFileList = re.findall(r"%s.*\d+.blob" % dbPath, data) + self.assertTrue(len(blobFileList) >= 1) + for blobFilename in blobFileList: + filenumber = re.findall(r"\d+.blob", blobFilename)[0] + self.assertEqual(blobFilename, dbPath + filenumber) + + # Check that all the manifest filenames + # have a correct full path (no multiple '/'). + manifestFileList = re.findall(r"%s.*MANIFEST-\d+" % dbPath, data) + self.assertTrue(len(manifestFileList) >= 1) + for manifestFilename in manifestFileList: + filenumber = re.findall(r"(?<=MANIFEST-)\d+", manifestFilename)[0] + self.assertEqual(manifestFilename, dbPath + "MANIFEST-" + filenumber) + + # Check that the blob file index is decoded. + decodedBlobIndex = re.findall(r"\[blob ref\]", data) + self.assertTrue(len(decodedBlobIndex) >= 1) + + def listLiveFilesMetadata(self, params, dumpFile): + return 0 == run_err_null( + "./ldb list_live_files_metadata %s > %s" % (params, dumpFile) + ) + + def testListLiveFilesMetadata(self): + print("Running testListLiveFilesMetadata...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + + # Compare the SST filename and the level of list_live_files_metadata + # with the data collected from dump_live_files. 
+ dumpFilePath1 = os.path.join(self.TMP_DIR, "dump1") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath1)) + dumpFilePath2 = os.path.join(self.TMP_DIR, "dump2") + self.assertTrue( + self.listLiveFilesMetadata( + "--sort_by_filename --db=%s" % dbPath, dumpFilePath2 + ) + ) + + # Collect SST filename and level from dump_live_files + with open(dumpFilePath1, "r") as tmp: + data = tmp.read() + filename1 = re.findall(r".*\d+\.sst", data)[0] + level1 = re.findall(r"level:\d+", data)[0].split(":")[1] + + # Collect SST filename and level from list_live_files_metadata + with open(dumpFilePath2, "r") as tmp: + data = tmp.read() + filename2 = re.findall(r".*\d+\.sst", data)[0] + level2 = re.findall(r"level \d+", data)[0].split(" ")[1] + + # Assert equality between filenames and levels. + self.assertEqual(filename1, filename2) + self.assertEqual(level1, level2) + + # Create multiple column families and compare the output + # of list_live_files_metadata with dump_live_files once again. + # Create new CF, and insert data: + self.assertRunOK("create_column_family mycol1", "OK") + self.assertRunOK("put --column_family=mycol1 v1 v2", "OK") + self.assertRunOK("create_column_family mycol2", "OK") + self.assertRunOK("put --column_family=mycol2 h1 h2", "OK") + self.assertRunOK("put --column_family=mycol2 h3 h4", "OK") + + # Call dump_live_files and list_live_files_metadata + # and pipe the output to compare them later. 
+ dumpFilePath3 = os.path.join(self.TMP_DIR, "dump3") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath3)) + dumpFilePath4 = os.path.join(self.TMP_DIR, "dump4") + self.assertTrue( + self.listLiveFilesMetadata( + "--sort_by_filename --db=%s" % dbPath, dumpFilePath4 + ) + ) + + # dump_live_files: + # parse the output and create a map: + # [key: sstFilename]->[value:[LSM level, Column Family Name]] + referenceMap = {} + with open(dumpFilePath3, "r") as tmp: + data = tmp.read() + # Note: the following regex are contingent on what the + # dump_live_files outputs. + namesAndLevels = re.findall(r"\d+.sst level:\d+", data) + cfs = re.findall(r"(?<=column family name=)\w+", data) + # re.findall should not reorder the data. + # Therefore namesAndLevels[i] matches the data from cfs[i]. + for count, nameAndLevel in enumerate(namesAndLevels): + sstFilename = re.findall(r"\d+.sst", nameAndLevel)[0] + sstLevel = re.findall(r"(?<=level:)\d+", nameAndLevel)[0] + cf = cfs[count] + referenceMap[sstFilename] = [sstLevel, cf] + + # list_live_files_metadata: + # parse the output and create a map: + # [key: sstFilename]->[value:[LSM level, Column Family Name]] + testMap = {} + with open(dumpFilePath4, "r") as tmp: + data = tmp.read() + # Since for each SST file, all the information is contained + # on one line, the parsing is easy to perform and relies on + # the appearance of an "00xxx.sst" pattern. + sstLines = re.findall(r".*\d+.sst.*", data) + for line in sstLines: + sstFilename = re.findall(r"\d+.sst", line)[0] + sstLevel = re.findall(r"(?<=level )\d+", line)[0] + cf = re.findall(r"(?<=column family \')\w+(?=\')", line)[0] + testMap[sstFilename] = [sstLevel, cf] + + # Compare the map obtained from dump_live_files and the map + # obtained from list_live_files_metadata. Everything should match. 
+ self.assertEqual(referenceMap, testMap) + + def getManifests(self, directory): + return glob.glob(directory + "/MANIFEST-*") + + def getSSTFiles(self, directory): + return glob.glob(directory + "/*.sst") + + def getWALFiles(self, directory): + return glob.glob(directory + "/*.log") + + def getBlobFiles(self, directory): + return glob.glob(directory + "/*.blob") + + def copyManifests(self, src, dest): + return 0 == run_err_null("cp " + src + " " + dest) + + def testManifestDump(self): + print("Running testManifestDump...") + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put 1 1 --create_if_missing", "OK") + self.assertRunOK("put 2 2", "OK") + self.assertRunOK("put 3 3", "OK") + # Pattern to expect from manifest_dump. + num = "[0-9]+" + st = ".*" + subpat = st + " seq:" + num + ", type:" + num + regex = num + ":" + num + "\[" + subpat + ".." + subpat + "\]" + expected_pattern = re.compile(regex) + cmd = "manifest_dump --db=%s" + manifest_files = self.getManifests(dbPath) + self.assertTrue(len(manifest_files) == 1) + # Test with the default manifest file in dbPath. + self.assertRunOKFull( + cmd % dbPath, expected_pattern, unexpected=False, isPattern=True + ) + self.copyManifests(manifest_files[0], manifest_files[0] + "1") + manifest_files = self.getManifests(dbPath) + self.assertTrue(len(manifest_files) == 2) + # Test with multiple manifest files in dbPath. + self.assertRunFAILFull(cmd % dbPath) + # Running it with the copy we just created should pass. + self.assertRunOKFull( + (cmd + " --path=%s") % (dbPath, manifest_files[1]), + expected_pattern, + unexpected=False, + isPattern=True, + ) + # Make sure that using the dump with --path will result in identical + # output as just using manifest_dump. + cmd = "dump --path=%s" + self.assertRunOKFull( + (cmd) % (manifest_files[1]), + expected_pattern, + unexpected=False, + isPattern=True, + ) + + # Check if null characters doesn't infer with output format. 
+ self.assertRunOK("put a1 b1", "OK") + self.assertRunOK("put a2 b2", "OK") + self.assertRunOK("put --hex 0x12000DA0 0x80C0000B", "OK") + self.assertRunOK("put --hex 0x7200004f 0x80000004", "OK") + self.assertRunOK("put --hex 0xa000000a 0xf000000f", "OK") + self.assertRunOK("put a3 b3", "OK") + self.assertRunOK("put a4 b4", "OK") + + # Verifies that all "levels" are printed out. + # There should be 66 mentions of levels. + expected_verbose_output = re.compile("matched") + # Test manifest_dump verbose and verify that key 0x7200004f + # is present. Note that we are forced to use grep here because + # an output with a non-terminating null character in it isn't piped + # correctly through the Python subprocess object. + # Also note that 0x72=r and 0x4f=O, hence the regex \'r.{2}O\' + # (we cannot use null character in the subprocess input either, + # so we have to use '.{2}') + cmd_verbose = ( + "manifest_dump --verbose --db=%s | grep -aq $''r.{2}O'' && echo 'matched' || echo 'not matched'" + % dbPath + ) + + self.assertRunOKFull( + cmd_verbose, expected_verbose_output, unexpected=False, isPattern=True + ) + + def testGetProperty(self): + print("Running testGetProperty...") + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put 1 1 --create_if_missing", "OK") + self.assertRunOK("put 2 2", "OK") + # A "string" property + cmd = "--db=%s get_property rocksdb.estimate-num-keys" + self.assertRunOKFull(cmd % dbPath, "rocksdb.estimate-num-keys: 2") + # A "map" property + # FIXME: why doesn't this pick up two entries? 
+ cmd = "--db=%s get_property rocksdb.aggregated-table-properties" + part = "rocksdb.aggregated-table-properties.num_entries: " + expected_pattern = re.compile(part) + self.assertRunOKFull( + cmd % dbPath, expected_pattern, unexpected=False, isPattern=True + ) + # An invalid property + cmd = "--db=%s get_property rocksdb.this-property-does-not-exist" + self.assertRunFAILFull(cmd % dbPath) + + def testSSTDump(self): + print("Running testSSTDump...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put sst1 sst1_val --create_if_missing", "OK") + self.assertRunOK("put sst2 sst2_val --enable_blob_files", "OK") + self.assertRunOK("get sst1", "sst1_val") + + # Pattern to expect from SST dump. + regex = ".*Sst file format:.*\n.*\[blob ref\].*" + expected_pattern = re.compile(regex) + + sst_files = self.getSSTFiles(dbPath) + self.assertTrue(len(sst_files) >= 1) + cmd = "dump --path=%s --decode_blob_index" + self.assertRunOKFull( + (cmd) % (sst_files[0]), expected_pattern, unexpected=False, isPattern=True + ) + + def testBlobDump(self): + print("Running testBlobDump") + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("batchput x1 y1 --create_if_missing --enable_blob_files", "OK") + self.assertRunOK( + 'batchput --enable_blob_files x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK" + ) + + # Pattern to expect from blob file dump. 
+ regex = ".*Blob log header[\s\S]*Blob log footer[\s\S]*Read record[\s\S]*Summary" # noqa + expected_pattern = re.compile(regex) + blob_files = self.getBlobFiles(dbPath) + self.assertTrue(len(blob_files) >= 1) + cmd = "dump --path=%s --dump_uncompressed_blobs" + self.assertRunOKFull( + (cmd) % (blob_files[0]), expected_pattern, unexpected=False, isPattern=True + ) + + def testWALDump(self): + print("Running testWALDump...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put wal1 wal1_val --create_if_missing", "OK") + self.assertRunOK("put wal2 wal2_val", "OK") + self.assertRunOK("get wal1", "wal1_val") + + # Pattern to expect from WAL dump. + regex = "^Sequence,Count,ByteSize,Physical Offset,Key\(s\).*" + expected_pattern = re.compile(regex) + + wal_files = self.getWALFiles(dbPath) + self.assertTrue(len(wal_files) >= 1) + cmd = "dump --path=%s" + self.assertRunOKFull( + (cmd) % (wal_files[0]), expected_pattern, unexpected=False, isPattern=True + ) + + def testListColumnFamilies(self): + print("Running testListColumnFamilies...") + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + cmd = 'list_column_families | grep -v "Column families"' + # Test on valid dbPath. + self.assertRunOK(cmd, "{default}") + # Test on empty path. + self.assertRunFAIL(cmd) + + def testColumnFamilies(self): + print("Running testColumnFamilies...") + _ = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put cf1_1 1 --create_if_missing", "OK") + self.assertRunOK("put cf1_2 2 --create_if_missing", "OK") + self.assertRunOK("put cf1_3 3 --try_load_options", "OK") + # Given non-default column family to single CF DB. 
+ self.assertRunFAIL("get cf1_1 --column_family=two") + self.assertRunOK("create_column_family two", "OK") + self.assertRunOK("put cf2_1 1 --create_if_missing --column_family=two", "OK") + self.assertRunOK("put cf2_2 2 --create_if_missing --column_family=two", "OK") + self.assertRunOK("delete cf1_2", "OK") + self.assertRunOK("create_column_family three", "OK") + self.assertRunOK("delete cf2_2 --column_family=two", "OK") + self.assertRunOK("put cf3_1 3 --create_if_missing --column_family=three", "OK") + self.assertRunOK("get cf1_1 --column_family=default", "1") + self.assertRunOK("dump --column_family=two", "cf2_1 ==> 1\nKeys in range: 1") + self.assertRunOK( + "dump --column_family=two --try_load_options", + "cf2_1 ==> 1\nKeys in range: 1", + ) + self.assertRunOK("dump", "cf1_1 ==> 1\ncf1_3 ==> 3\nKeys in range: 2") + self.assertRunOK("get cf2_1 --column_family=two", "1") + self.assertRunOK("get cf3_1 --column_family=three", "3") + self.assertRunOK("drop_column_family three", "OK") + # non-existing column family. 
+ self.assertRunFAIL("get cf3_1 --column_family=four") + self.assertRunFAIL("drop_column_family four") + + def testIngestExternalSst(self): + print("Running testIngestExternalSst...") + + # Dump, load, write external sst and ingest it in another db + dbPath = os.path.join(self.TMP_DIR, "db1") + self.assertRunOK( + "batchput --db=%s --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4" % dbPath, + "OK", + ) + self.assertRunOK("scan --db=%s" % dbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + with open(dumpFilePath, "w") as f: + f.write("x1 ==> y10\nx2 ==> y20\nx3 ==> y30\nx4 ==> y40") + externSstPath = os.path.join(self.TMP_DIR, "extern_data1.sst") + self.assertTrue( + self.writeExternSst( + "--create_if_missing --db=%s" % dbPath, dumpFilePath, externSstPath + ) + ) + # cannot ingest if allow_global_seqno is false + self.assertFalse( + self.ingestExternSst( + "--create_if_missing --allow_global_seqno=false --db=%s" % dbPath, + externSstPath, + ) + ) + self.assertTrue( + self.ingestExternSst( + "--create_if_missing --allow_global_seqno --db=%s" % dbPath, + externSstPath, + ) + ) + self.assertRunOKFull( + "scan --db=%s" % dbPath, "x1 : y10\nx2 : y20\nx3 : y30\nx4 : y40" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/rocksdb/tools/ldb_tool.cc b/src/rocksdb/tools/ldb_tool.cc new file mode 100644 index 000000000..eadb6a095 --- /dev/null +++ b/src/rocksdb/tools/ldb_tool.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE +#include "rocksdb/ldb_tool.h" + +#include "rocksdb/utilities/ldb_cmd.h" +#include "tools/ldb_cmd_impl.h" + +namespace ROCKSDB_NAMESPACE { + +LDBOptions::LDBOptions() {} + +void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, + const char* /*exec_name*/, bool to_stderr) { + std::string ret; + + ret.append(ldb_options.print_help_header); + ret.append("\n\n"); + ret.append("commands MUST specify --" + LDBCommand::ARG_DB + + "=<full_path_to_db_directory> when necessary\n"); + ret.append("\n"); + ret.append("commands can optionally specify\n"); + ret.append(" --" + LDBCommand::ARG_ENV_URI + "=<uri_of_environment> or --" + + LDBCommand::ARG_FS_URI + "=<uri_of_filesystem> if necessary"); + ret.append("\n"); + ret.append(" --" + LDBCommand::ARG_SECONDARY_PATH + + "=<secondary_path> to open DB as secondary instance. Operations " + "not supported in secondary instance will fail.\n\n"); + ret.append( + "The following optional parameters control if keys/values are " + "input/output as hex or as plain strings:\n"); + ret.append(" --" + LDBCommand::ARG_KEY_HEX + + " : Keys are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_VALUE_HEX + + " : Values are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_HEX + + " : Both keys and values are input/output as hex\n"); + ret.append("\n"); + + ret.append( + "The following optional parameters control the database " + "internals:\n"); + ret.append( + " --" + LDBCommand::ARG_CF_NAME + + "=<string> : name of the column family to operate on. default: default " + "column family\n"); + ret.append(" --" + LDBCommand::ARG_TTL + + " with 'put','get','scan','dump','query','batchput'" + " : DB supports ttl and value is internally timestamp-suffixed\n"); + ret.append(" --" + LDBCommand::ARG_TRY_LOAD_OPTIONS + + " : Try to load option file from DB. Default to true if " + + LDBCommand::ARG_DB + + " is specified and not creating a new DB and not open as TTL DB. 
" + "Can be set to false explicitly.\n"); + ret.append(" --" + LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS + + " : Set options.force_consistency_checks = false.\n"); + ret.append(" --" + LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS + + " : Ignore unknown options when loading option file.\n"); + ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n"); + ret.append(" --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=<int,e.g.:14>\n"); + ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + + "=<no|snappy|zlib|bzip2|lz4|lz4hc|xpress|zstd>\n"); + ret.append(" --" + LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES + + "=<int,e.g.:16384>\n"); + ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + "=<block_size_in_bytes>\n"); + ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n"); + ret.append(" --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE + + "=<int,e.g.:16777216>\n"); + ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + + "=<int,e.g.:4194304>\n"); + ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n"); + ret.append(" --" + LDBCommand::ARG_ENABLE_BLOB_FILES + + " : Enable key-value separation using BlobDB\n"); + ret.append(" --" + LDBCommand::ARG_MIN_BLOB_SIZE + "=<int,e.g.:2097152>\n"); + ret.append(" --" + LDBCommand::ARG_BLOB_FILE_SIZE + "=<int,e.g.:2097152>\n"); + ret.append(" --" + LDBCommand::ARG_BLOB_COMPRESSION_TYPE + + "=<no|snappy|zlib|bzip2|lz4|lz4hc|xpress|zstd>\n"); + ret.append(" --" + LDBCommand::ARG_ENABLE_BLOB_GARBAGE_COLLECTION + + " : Enable blob garbage collection\n"); + ret.append(" --" + LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF + + "=<double,e.g.:0.25>\n"); + ret.append(" --" + LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD + + "=<double,e.g.:0.25>\n"); + ret.append(" --" + LDBCommand::ARG_BLOB_COMPACTION_READAHEAD_SIZE + + "=<int,e.g.:2097152>\n"); + + ret.append("\n\n"); + ret.append("Data Access Commands:\n"); + PutCommand::Help(ret); + GetCommand::Help(ret); + BatchPutCommand::Help(ret); + 
ScanCommand::Help(ret); + DeleteCommand::Help(ret); + DeleteRangeCommand::Help(ret); + DBQuerierCommand::Help(ret); + ApproxSizeCommand::Help(ret); + CheckConsistencyCommand::Help(ret); + ListFileRangeDeletesCommand::Help(ret); + + ret.append("\n\n"); + ret.append("Admin Commands:\n"); + WALDumperCommand::Help(ret); + CompactorCommand::Help(ret); + ReduceDBLevelsCommand::Help(ret); + ChangeCompactionStyleCommand::Help(ret); + DBDumperCommand::Help(ret); + DBLoaderCommand::Help(ret); + ManifestDumpCommand::Help(ret); + UpdateManifestCommand::Help(ret); + FileChecksumDumpCommand::Help(ret); + GetPropertyCommand::Help(ret); + ListColumnFamiliesCommand::Help(ret); + CreateColumnFamilyCommand::Help(ret); + DropColumnFamilyCommand::Help(ret); + DBFileDumperCommand::Help(ret); + InternalDumpCommand::Help(ret); + DBLiveFilesMetadataDumperCommand::Help(ret); + RepairCommand::Help(ret); + BackupCommand::Help(ret); + RestoreCommand::Help(ret); + CheckPointCommand::Help(ret); + WriteExternalSstFilesCommand::Help(ret); + IngestExternalSstFilesCommand::Help(ret); + UnsafeRemoveSstFileCommand::Help(ret); + + fprintf(to_stderr ? 
stderr : stdout, "%s\n", ret.c_str()); +} + +int LDBCommandRunner::RunCommand( + int argc, char const* const* argv, Options options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families) { + if (argc <= 2) { + if (argc <= 1) { + PrintHelp(ldb_options, argv[0], /*to_stderr*/ true); + return 1; + } else if (std::string(argv[1]) == "--version") { + printf("ldb from RocksDB %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, + ROCKSDB_PATCH); + return 0; + } else if (std::string(argv[1]) == "--help") { + PrintHelp(ldb_options, argv[0], /*to_stderr*/ false); + return 0; + } else { + PrintHelp(ldb_options, argv[0], /*to_stderr*/ true); + return 1; + } + } + + LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs( + argc, argv, options, ldb_options, column_families); + if (cmdObj == nullptr) { + fprintf(stderr, "Unknown command\n"); + PrintHelp(ldb_options, argv[0], /*to_stderr*/ true); + return 1; + } + + if (!cmdObj->ValidateCmdLineOptions()) { + return 1; + } + + cmdObj->Run(); + LDBCommandExecuteResult ret = cmdObj->GetExecuteState(); + if (!ret.ToString().empty()) { + fprintf(stderr, "%s\n", ret.ToString().c_str()); + } + delete cmdObj; + + return ret.IsFailed() ? 1 : 0; +} + +void LDBTool::Run(int argc, char** argv, Options options, + const LDBOptions& ldb_options, + const std::vector<ColumnFamilyDescriptor>* column_families) { + int error_code = LDBCommandRunner::RunCommand(argc, argv, options, + ldb_options, column_families); + exit(error_code); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/pflag b/src/rocksdb/tools/pflag new file mode 100755 index 000000000..f3394a666 --- /dev/null +++ b/src/rocksdb/tools/pflag @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +# +#(c) 2004-present, Facebook, all rights reserved. +# See the LICENSE file for usage and distribution rights. 
+# + +trap 'echo "Caught exception, dying"; exit' 1 2 3 15 + +ME=`basename $0` +SERVER=`hostname` + +#parameters used +# +Dump_Config=0 +DEBUG= +OS=`/bin/uname -s` +VMEM= +RSS= +CPU= +VERBOSE= +VAR= +LIMIT= +ACTION= +N= +WAIT= + +# +#supported OS: Linux only for now. Easy to add +# +oscheck() { + case ${OS} in + Linux) + VMEM=vsz + RSS=rss + CPU=bsdtime + ;; + *) + die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks." + ;; + esac +} + + +verbose() { + if [ "x$DEBUG" != "x" ]; then + echo "$@" >&2 + fi +} + +warn() { + echo "$@" >&2 +} + +die() { + echo "ERROR: " "$@" >&2; + exit; +} + +dump_config() { + cat <<EOCONFIG; +$ME running on ${HOSTNAME} at `date` + +Configuration for this run: + PID to monitor : ${PID} + Resource monitored : ${VAR} + Resource limit : ${LIMIT} + Check every : ${WAIT} seconds + No. of times run : ${N} + What to do : ${ACTION} +EOCONFIG + +} + +usage() { + cat <<USAGE; exit +$@ + +Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait] + +Monitor a process for set of violations. Options: + + -p: PID of process to monitor + + -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM + + -l: what is the threshold/limit for the metric that is being sensed. + Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU + NOTE: defaults to 1GB + + -a: action. Currently {warn|die|kill} are supported. + The default action is to 'warn'. Here is the behavior: + + warn: complain if usage exceeds threshold, but continue monitoring + kill: complain, kill the db_bench process and exit + die: if usage exceeds threshold, die immediately + + -n: number of cycles to monitor. Default is to monitor until PID no longer exists. + + -w: wait time per cycle of monitoring. Default is 5 seconds. 
+ + -v: verbose messaging + +USAGE + +} + +#set default values if none given +set_defaults_if_noopt_given() { + + : ${VAR:=vsz} + : ${LIMIT:=1024000} + : ${WAIT:=5} + : ${N:=999999} + : ${ACTION:=warn} +} + +validate_options() { + if [ "x$PID" = "x" -a $Dump_Config -ne 1 ]; then + usage "PID is mandatory" + fi +} + +###### START + + + while getopts ":p:x:l:a:n:t:vhd" opt; do + case $opt in + d) + Dump_Config=1 + ;; + h) + usage; + ;; + a) + ACTION=${OPTARG}; + ;; + v) + DEBUG=1; + ;; + p) + PID=$OPTARG; + ;; + x) + VAR=$OPTARG; + ;; + l) + LIMIT=$OPTARG; + ;; + w) + WAIT=$OPTARG; + ;; + n) + N=$OPTARG; + ;; + \?) + usage; + ;; + esac + done + +oscheck; +set_defaults_if_noopt_given; +validate_options; + +if [ $Dump_Config -eq 1 ]; then + dump_config; + exit; +fi + +Done=0 + +verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"; + +while [ $Done -eq 0 ]; do + VAL=`/bin/ps h -p $PID -o ${VAR} | perl -pe 'chomp; s/(.*)m/$1 * 1024/e; s/(.*)g/$1 * 1024 * 1024/e;'` + if [ ${VAL:=0} -eq 0 ]; then + warn "Process $PID ended without incident." + Done=1; + break; + fi + + if [ $VAL -ge $LIMIT ]; then + Done=1; + else + echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}" + sleep $WAIT; + fi + if [ $Done -eq 1 ]; then + + if [ "$ACTION" = "kill" ]; then + kill ${PID} || kill -3 ${PID} + exit; + + elif [ "$ACTION" = "warn" ]; then + + # go back to monitoring. + + warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}" + Done=0 #go back to monitoring + + elif [ "$ACTION" = "die" ]; then + warn "WARNING: dying without killing process ${PID} on ${SERVER}" + warn "The process details are below: " + warn "`ps -p ${PID} -o pid,ppid,bsdtime,rss,vsz,cmd,args`" + warn "" + + #should we send email/notify someone? TODO... for now, bail. 
+ + exit -1; + + fi + else + : + #warn "INFO: PID $PID, $VAR = $VAL, limit ($LIMIT) not exceeded"; + fi +done + diff --git a/src/rocksdb/tools/reduce_levels_test.cc b/src/rocksdb/tools/reduce_levels_test.cc new file mode 100644 index 000000000..c8604bf43 --- /dev/null +++ b/src/rocksdb/tools/reduce_levels_test.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#ifndef ROCKSDB_LITE + +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "tools/ldb_cmd_impl.h" +#include "util/cast_util.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class ReduceLevelTest : public testing::Test { + public: + ReduceLevelTest() { + dbname_ = test::PerThreadDBPath("db_reduce_levels_test"); + EXPECT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + } + + Status OpenDB(bool create_if_missing, int levels); + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + std::string Get(const std::string& k) { + ReadOptions options; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + Status Flush() { + if (db_ == nullptr) { + return Status::InvalidArgument("DB not opened."); + } + DBImpl* db_impl = static_cast_with_check<DBImpl>(db_); + return db_impl->TEST_FlushMemTable(); + } + + void MoveL0FileToLevel(int level) { + DBImpl* db_impl = static_cast_with_check<DBImpl>(db_); + for (int i = 0; i < level; ++i) { + ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr)); + } + } + + void CloseDB() { + if 
(db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + bool ReduceLevels(int target_level); + + int FilesOnLevel(int level) { + std::string property; + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + std::to_string(level), &property)); + return atoi(property.c_str()); + } + + private: + std::string dbname_; + DB* db_; +}; + +Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) { + ROCKSDB_NAMESPACE::Options opt; + opt.num_levels = num_levels; + opt.create_if_missing = create_if_missing; + ROCKSDB_NAMESPACE::Status st = + ROCKSDB_NAMESPACE::DB::Open(opt, dbname_, &db_); + if (!st.ok()) { + fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); + } + return st; +} + +bool ReduceLevelTest::ReduceLevels(int target_level) { + std::vector<std::string> args = + ROCKSDB_NAMESPACE::ReduceDBLevelsCommand::PrepareArgs( + dbname_, target_level, false); + LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs( + args, Options(), LDBOptions(), nullptr, LDBCommand::SelectCommand); + level_reducer->Run(); + bool is_succeed = level_reducer->GetExecuteState().IsSucceed(); + delete level_reducer; + return is_succeed; +} + +TEST_F(ReduceLevelTest, Last_Level) { + ASSERT_OK(OpenDB(true, 4)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(Flush()); + MoveL0FileToLevel(3); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3)); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2)); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); +} + +TEST_F(ReduceLevelTest, Top_Level) { + ASSERT_OK(OpenDB(true, 5)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(Flush()); + ASSERT_EQ(FilesOnLevel(0), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2)); + CloseDB(); 
+} + +TEST_F(ReduceLevelTest, All_Levels) { + ASSERT_OK(OpenDB(true, 5)); + ASSERT_OK(Put("a", "a11111")); + ASSERT_OK(Flush()); + MoveL0FileToLevel(4); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5)); + ASSERT_OK(Put("b", "b11111")); + ASSERT_OK(Flush()); + MoveL0FileToLevel(3); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5)); + ASSERT_OK(Put("c", "c11111")); + ASSERT_OK(Flush()); + MoveL0FileToLevel(2); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5)); + ASSERT_OK(Put("d", "d11111")); + ASSERT_OK(Flush()); + MoveL0FileToLevel(1); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/tools/regression_test.sh b/src/rocksdb/tools/regression_test.sh new file mode 
100755 index 000000000..2743c5aee --- /dev/null +++ b/src/rocksdb/tools/regression_test.sh @@ -0,0 +1,477 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# The RocksDB regression test script. +# REQUIREMENT: must be able to run make db_bench in the current directory +# +# This script will do the following things in order: +# +# 1. check out the specified rocksdb commit. +# 2. build db_bench using the specified commit +# 3. setup test directory $TEST_PATH. If not specified, then the test directory +# will be "/tmp/rocksdb/regression_test" +# 4. run set of benchmarks on the specified host +# (can be either locally or remotely) +# 5. generate report in the $RESULT_PATH. If RESULT_PATH is not specified, +# RESULT_PATH will be set to $TEST_PATH/current_time +# +# = Examples = +# * Run the regression test using rocksdb commit abcdef that outputs results +# and temp files in "/my/output/dir" +#r +# TEST_PATH=/my/output/dir COMMIT_ID=abcdef ./tools/regression_test.sh +# +# * Run the regression test on a remost host under "/my/output/dir" directory +# and stores the result locally in "/my/benchmark/results" using commit +# abcdef and with the rocksdb options specified in /my/path/to/OPTIONS-012345 +# with 1000000000 keys in each benchmark in the regression test where each +# key and value are 100 and 900 bytes respectively: +# +# REMOTE_USER_AT_HOST=yhchiang@my.remote.host \ +# TEST_PATH=/my/output/dir \ +# RESULT_PATH=/my/benchmark/results \ +# COMMIT_ID=abcdef \ +# OPTIONS_FILE=/my/path/to/OPTIONS-012345 \ +# NUM_KEYS=1000000000 \ +# KEY_SIZE=100 \ +# VALUE_SIZE=900 \ +# ./tools/regression_test.sh +# +# = Regression test environmental parameters = +# DEBUG: If true, then the script will not build db_bench if db_bench already +# exists +# Default: 0 +# TEST_MODE: If 1, run fillseqdeterminstic and benchmarks both +# if 0, only run fillseqdeterministc +# if 2, only run benchmarks +# Default: 1 +# TEST_PATH: the root directory 
of the regression test. +# Default: "/tmp/rocksdb/regression_test" +# !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db +# RESULT_PATH: the directory where the regression results will be generated. +# Default: "$TEST_PATH/current_time" +# REMOTE_USER_AT_HOST: If set, then test will run on the specified host under +# TEST_PATH directory and outputs test results locally in RESULT_PATH +# The REMOTE_USER_AT_HOST should follow the format user-id@host.name +# DB_PATH: the path where the rocksdb database will be created during the +# regression test. Default: $TEST_PATH/db +# WAL_PATH: the path where the rocksdb WAL will be outputed. +# Default: $TEST_PATH/wal +# OPTIONS_FILE: If specified, then the regression test will use the specified +# file to initialize the RocksDB options in its benchmarks. Note that +# this feature only work for commits after 88acd93 or rocksdb version +# later than 4.9. +# DELETE_TEST_PATH: If true, then the test directory will be deleted +# after the script ends. +# Default: 0 +# +# = db_bench parameters = +# NUM_THREADS: The number of concurrent foreground threads that will issue +# database operations in the benchmark. Default: 16. +# NUM_KEYS: The key range that will be used in the entire regression test. +# Default: 1G. +# NUM_OPS: The number of operations (reads, writes, or deletes) that will +# be issued in EACH thread. +# Default: $NUM_KEYS / $NUM_THREADS +# KEY_SIZE: The size of each key in bytes in db_bench. Default: 100. +# VALUE_SIZE: The size of each value in bytes in db_bench. Default: 900. +# CACHE_SIZE: The size of RocksDB block cache used in db_bench. Default: 1G +# STATISTICS: If 1, then statistics is on in db_bench. Default: 0. +# COMPRESSION_RATIO: The compression ratio of the key generated in db_bench. +# Default: 0.5. +# HISTOGRAM: If 1, then the histogram feature on performance feature is on. +# STATS_PER_INTERVAL: If 1, then the statistics will be reported for every +# STATS_INTERVAL_SECONDS seconds. Default 1. 
+# STATS_INTERVAL_SECONDS: If STATS_PER_INTERVAL is set to 1, then statistics +# will be reported for every STATS_INTERVAL_SECONDS. Default 60. +# MAX_BACKGROUND_FLUSHES: The maxinum number of concurrent flushes in +# db_bench. Default: 4. +# MAX_BACKGROUND_COMPACTIONS: The maximum number of concurrent compactions +# in db_bench. Default: 16. +# NUM_HIGH_PRI_THREADS: The number of high-pri threads available for +# concurrent flushes in db_bench. Default: 4. +# NUM_LOW_PRI_THREADS: The number of low-pri threads available for +# concurrent compactions in db_bench. Default: 16. +# SEEK_NEXTS: Controls how many Next() will be called after seek. +# Default: 10. +# SEED: random seed that controls the randomness of the benchmark. +# Default: $( date +%s ) + +#============================================================================== +# CONSTANT +#============================================================================== +TITLE_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s," +TITLE_FORMAT+="%10s,%13s,%14s,%11s,%12s," +TITLE_FORMAT+="%7s,%11s," +TITLE_FORMAT+="%9s,%10s,%10s,%10s,%10s,%10s,%5s," +TITLE_FORMAT+="%5s,%5s,%5s" # time +TITLE_FORMAT+="\n" + +DATA_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s," +DATA_FORMAT+="%10s,%13.0f,%14s,%11s,%12s," +DATA_FORMAT+="%7s,%11s," +DATA_FORMAT+="%9.0f,%10.0f,%10.0f,%10.0f,%10.0f,%10.0f,%5.0f," +DATA_FORMAT+="%5.0f,%5.0f,%5.0f" # time +DATA_FORMAT+="\n" + +MAIN_PATTERN="$1""[[:blank:]]+:.*[[:blank:]]+([0-9\.]+)[[:blank:]]+ops/sec" +PERC_PATTERN="Percentiles: P50: ([0-9\.]+) P75: ([0-9\.]+) " +PERC_PATTERN+="P99: ([0-9\.]+) P99.9: ([0-9\.]+) P99.99: ([0-9\.]+)" +#============================================================================== + +function main { + TEST_ROOT_DIR=${TEST_PATH:-"/tmp/rocksdb/regression_test"} + init_arguments $TEST_ROOT_DIR + + build_db_bench_and_ldb + + setup_test_directory + if [ $TEST_MODE -le 1 ]; then + test_remote "test -d $ORIGIN_PATH" + if [[ $? -ne 0 ]]; then + echo "Building DB..." 
+ # compactall alone will not print ops or threads, which will fail update_report + run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0 + # only save for future use on success + test_remote "mv $DB_PATH $ORIGIN_PATH" + fi + fi + if [ $TEST_MODE -ge 1 ]; then + build_checkpoint + run_db_bench "readrandom" + run_db_bench "readwhilewriting" + run_db_bench "deleterandom" + run_db_bench "seekrandom" + run_db_bench "seekrandomwhilewriting" + run_db_bench "multireadrandom" + fi + + cleanup_test_directory $TEST_ROOT_DIR + echo "" + echo "Benchmark completed! Results are available in $RESULT_PATH" +} + +############################################################################ +function init_arguments { + K=1024 + M=$((1024 * K)) + G=$((1024 * M)) + + current_time=$(date +"%F-%H:%M:%S") + RESULT_PATH=${RESULT_PATH:-"$1/results/$current_time"} + COMMIT_ID=`hg id -i 2>/dev/null || git rev-parse HEAD 2>/dev/null || echo 'unknown'` + SUMMARY_FILE="$RESULT_PATH/SUMMARY.csv" + + DB_PATH=${3:-"$1/db"} + ORIGIN_PATH=${ORIGIN_PATH:-"$(dirname $(dirname $DB_PATH))/db"} + WAL_PATH=${4:-""} + if [ -z "$REMOTE_USER_AT_HOST" ]; then + DB_BENCH_DIR=${5:-"."} + else + DB_BENCH_DIR=${5:-"$1/db_bench"} + fi + + DEBUG=${DEBUG:-0} + TEST_MODE=${TEST_MODE:-1} + SCP=${SCP:-"scp"} + SSH=${SSH:-"ssh"} + NUM_THREADS=${NUM_THREADS:-16} + NUM_KEYS=${NUM_KEYS:-$((1 * G))} # key range + NUM_OPS=${NUM_OPS:-$(($NUM_KEYS / $NUM_THREADS))} + KEY_SIZE=${KEY_SIZE:-100} + VALUE_SIZE=${VALUE_SIZE:-900} + CACHE_SIZE=${CACHE_SIZE:-$((1 * G))} + STATISTICS=${STATISTICS:-0} + COMPRESSION_RATIO=${COMPRESSION_RATIO:-0.5} + HISTOGRAM=${HISTOGRAM:-1} + NUM_MULTI_DB=${NUM_MULTI_DB:-1} + STATS_PER_INTERVAL=${STATS_PER_INTERVAL:-1} + STATS_INTERVAL_SECONDS=${STATS_INTERVAL_SECONDS:-600} + MAX_BACKGROUND_FLUSHES=${MAX_BACKGROUND_FLUSHES:-4} + MAX_BACKGROUND_COMPACTIONS=${MAX_BACKGROUND_COMPACTIONS:-16} + NUM_HIGH_PRI_THREADS=${NUM_HIGH_PRI_THREADS:-4} + NUM_LOW_PRI_THREADS=${NUM_LOW_PRI_THREADS:-16} + 
DELETE_TEST_PATH=${DELETE_TEST_PATH:-0} + SEEK_NEXTS=${SEEK_NEXTS:-10} + SEED=${SEED:-$( date +%s )} + MULTIREAD_BATCH_SIZE=${MULTIREAD_BATCH_SIZE:-128} + MULTIREAD_STRIDE=${MULTIREAD_STRIDE:-12} + PERF_LEVEL=${PERF_LEVEL:-1} +} + +# $1 --- benchmark name +# $2 --- number of operations. Default: $NUM_KEYS +# $3 --- number of threads. Default $NUM_THREADS +# $4 --- use_existing_db. Default: 1 +# $5 --- update_report. Default: 1 +function run_db_bench { + # Make sure no other db_bench is running. (Make sure command succeeds if pidof + # command exists but finds nothing.) + pids_cmd='pidof db_bench || pidof --version > /dev/null' + # But first, make best effort to kill any db_bench that have run for more + # than 12 hours, as that indicates a hung or runaway process. + kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null' + if ! [ -z "$REMOTE_USER_AT_HOST" ]; then + pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'" + kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'" + fi + + eval $kill_old_cmd + exit_on_error $? "$kill_old_cmd" + + pids_output="$(eval $pids_cmd)" + exit_on_error $? 
"$pids_cmd" + + if [ "$pids_output" != "" ]; then + echo "Stopped regression_test.sh as there're still recent db_bench " + echo "processes running: $pids_output" + echo "Clean up test directory" + cleanup_test_directory $TEST_ROOT_DIR + exit 2 + fi + + # Build db_bench command + ops=${2:-$NUM_OPS} + threads=${3:-$NUM_THREADS} + USE_EXISTING_DB=${4:-1} + UPDATE_REPORT=${5:-1} + echo "" + echo "=======================================================================" + echo "Benchmark $1" + echo "=======================================================================" + echo "" + db_bench_error=0 + options_file_arg=$(setup_options_file) + echo "$options_file_arg" + # use `which time` to avoid using bash's internal time command + db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \ + --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \ + --use_existing_db=$USE_EXISTING_DB \ + --perf_level=$PERF_LEVEL \ + --disable_auto_compactions \ + --threads=$threads \ + --num=$NUM_KEYS \ + --reads=$ops \ + --writes=$ops \ + --deletes=$ops \ + --key_size=$KEY_SIZE \ + --value_size=$VALUE_SIZE \ + --cache_size=$CACHE_SIZE \ + --statistics=$STATISTICS \ + $options_file_arg \ + --compression_ratio=$COMPRESSION_RATIO \ + --histogram=$HISTOGRAM \ + --seek_nexts=$SEEK_NEXTS \ + --stats_per_interval=$STATS_PER_INTERVAL \ + --stats_interval_seconds=$STATS_INTERVAL_SECONDS \ + --max_background_flushes=$MAX_BACKGROUND_FLUSHES \ + --num_multi_db=$NUM_MULTI_DB \ + --max_background_compactions=$MAX_BACKGROUND_COMPACTIONS \ + --num_high_pri_threads=$NUM_HIGH_PRI_THREADS \ + --num_low_pri_threads=$NUM_LOW_PRI_THREADS \ + --seed=$SEED \ + --multiread_batched=true \ + --batch_size=$MULTIREAD_BATCH_SIZE \ + --multiread_stride=$MULTIREAD_STRIDE 2>&1" + if ! 
[ -z "$REMOTE_USER_AT_HOST" ]; then + echo "Running benchmark remotely on $REMOTE_USER_AT_HOST" + db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'" + fi + echo db_bench_cmd="$db_bench_cmd" + + # Run the db_bench command + eval $db_bench_cmd | tee -a "$RESULT_PATH/$1" + exit_on_error ${PIPESTATUS[0]} db_bench + if [ $UPDATE_REPORT -ne 0 ]; then + update_report "$1" "$RESULT_PATH/$1" $ops $threads + fi +} + +function build_checkpoint { + cmd_prefix="" + if ! [ -z "$REMOTE_USER_AT_HOST" ]; then + cmd_prefix="$SSH $REMOTE_USER_AT_HOST " + fi + if [ $NUM_MULTI_DB -gt 1 ]; then + dirs=$($cmd_prefix find $ORIGIN_PATH -type d -links 2) + for dir in $dirs; do + db_index=$(basename $dir) + echo "Building checkpoints: $ORIGIN_PATH/$db_index -> $DB_PATH/$db_index ..." + $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index \ + --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1 + exit_on_error $? + done + else + # checkpoint cannot build in directory already exists + $cmd_prefix rm -rf $DB_PATH + echo "Building checkpoint: $ORIGIN_PATH -> $DB_PATH ..." + $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH \ + --db=$ORIGIN_PATH --try_load_options 2>&1 + exit_on_error $? + fi +} + +function multiply { + echo "$1 * $2" | bc +} + +# $1 --- name of the benchmark +# $2 --- the filename of the output log of db_bench +function update_report { + main_result=`cat $2 | grep $1` + exit_on_error $? + perc_statement=`cat $2 | grep Percentile` + exit_on_error $? 
+ + # Obtain micros / op + + [[ $main_result =~ $MAIN_PATTERN ]] + ops_per_s=${BASH_REMATCH[1]} + + # Obtain percentile information + [[ $perc_statement =~ $PERC_PATTERN ]] + perc[0]=${BASH_REMATCH[1]} # p50 + perc[1]=${BASH_REMATCH[2]} # p75 + perc[2]=${BASH_REMATCH[3]} # p99 + perc[3]=${BASH_REMATCH[4]} # p99.9 + perc[4]=${BASH_REMATCH[5]} # p99.99 + + # Parse the output of the time command + real_sec=`tail -3 $2 | grep real | awk '{print $2}'` + user_sec=`tail -3 $2 | grep user | awk '{print $2}'` + sys_sec=`tail -3 $2 | grep sys | awk '{print $2}'` + + (printf "$DATA_FORMAT" \ + $COMMIT_ID $1 $REMOTE_USER_AT_HOST $NUM_MULTI_DB $NUM_KEYS $KEY_SIZE $VALUE_SIZE \ + $(multiply $COMPRESSION_RATIO 100) \ + $3 $4 $CACHE_SIZE \ + $MAX_BACKGROUND_FLUSHES $MAX_BACKGROUND_COMPACTIONS \ + $ops_per_s \ + $(multiply ${perc[0]} 1000) \ + $(multiply ${perc[1]} 1000) \ + $(multiply ${perc[2]} 1000) \ + $(multiply ${perc[3]} 1000) \ + $(multiply ${perc[4]} 1000) \ + $DEBUG \ + $real_sec \ + $user_sec \ + $sys_sec \ + >> $SUMMARY_FILE) + exit_on_error $? +} + +function exit_on_error { + if [ $1 -ne 0 ]; then + echo "" + echo "ERROR: Benchmark did not complete successfully." + if ! [ -z "$2" ]; then + echo "Failure command: $2" + fi + echo "Partial results are output to $RESULT_PATH" + echo "ERROR" >> $SUMMARY_FILE + exit $1 + fi +} + +function build_db_bench_and_ldb { + echo "Building db_bench & ldb ..." + + make clean + exit_on_error $? + + DEBUG_LEVEL=0 make db_bench ldb -j32 + exit_on_error $? +} + +function run_remote { + test_remote "$1" + exit_on_error $? "$1" +} + +function test_remote { + if ! [ -z "$REMOTE_USER_AT_HOST" ]; then + cmd="$SSH $REMOTE_USER_AT_HOST '$1'" + else + cmd="$1" + fi + eval "$cmd" +} + +function run_local { + eval "$1" + exit_on_error $? "$1" +} + +function setup_options_file { + if ! [ -z "$OPTIONS_FILE" ]; then + if ! 
[ -z "$REMOTE_USER_AT_HOST" ]; then + options_file="$DB_BENCH_DIR/OPTIONS_FILE" + run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file" + else + options_file="$OPTIONS_FILE" + fi + echo "--options_file=$options_file" + fi + echo "" +} + +function setup_test_directory { + echo "Deleting old regression test directories and creating new ones" + + run_local 'test "$DB_PATH" != "."' + run_remote "rm -rf $DB_PATH" + + if [ "$DB_BENCH_DIR" != "." ]; then + run_remote "rm -rf $DB_BENCH_DIR" + fi + + run_local 'test "$RESULT_PATH" != "."' + run_local "rm -rf $RESULT_PATH" + + if ! [ -z "$WAL_PATH" ]; then + run_remote "rm -rf $WAL_PATH" + run_remote "mkdir -p $WAL_PATH" + fi + + run_remote "mkdir -p $DB_PATH" + + run_remote "mkdir -p $DB_BENCH_DIR" + run_remote "ls -l $DB_BENCH_DIR" + + if ! [ -z "$REMOTE_USER_AT_HOST" ]; then + run_local "$SCP ./db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/db_bench" + run_local "$SCP ./ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/ldb" + fi + + run_local "mkdir -p $RESULT_PATH" + + (printf $TITLE_FORMAT \ + "commit id" "benchmark" "user@host" "num-dbs" "key-range" "key-size" \ + "value-size" "compress-rate" "ops-per-thread" "num-threads" "cache-size" \ + "flushes" "compactions" \ + "ops-per-s" "p50" "p75" "p99" "p99.9" "p99.99" "debug" \ + "real-sec" "user-sec" "sys-sec" \ + >> $SUMMARY_FILE) + exit_on_error $? +} + +function cleanup_test_directory { + + if [ $DELETE_TEST_PATH -ne 0 ]; then + echo "Clear old regression test directories and creating new ones" + run_remote "rm -rf $DB_PATH" + run_remote "rm -rf $WAL_PATH" + if ! 
[ -z "$REMOTE_USER_AT_HOST" ]; then + run_remote "rm -rf $DB_BENCH_DIR" + fi + run_remote "rm -rf $1" + else + echo "------------ DEBUG MODE ------------" + echo "DB PATH: $DB_PATH" + echo "WAL PATH: $WAL_PATH" + fi +} + +############################################################################ + +# shellcheck disable=SC2068 +main $@ diff --git a/src/rocksdb/tools/restore_db.sh b/src/rocksdb/tools/restore_db.sh new file mode 100755 index 000000000..ed89794b2 --- /dev/null +++ b/src/rocksdb/tools/restore_db.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# + +if [ "$#" -lt 2 ]; then + echo "usage: ${BASH_SOURCE[0]} <Backup Dir> <DB Path>" + exit 1 +fi + +backup_dir="$1" +db_dir="$2" + +echo "== Restoring latest from $backup_dir to $db_dir" +./ldb restore --db="$db_dir" --backup_dir="$backup_dir" diff --git a/src/rocksdb/tools/rocksdb_dump_test.sh b/src/rocksdb/tools/rocksdb_dump_test.sh new file mode 100755 index 000000000..532c53267 --- /dev/null +++ b/src/rocksdb/tools/rocksdb_dump_test.sh @@ -0,0 +1,9 @@ +# shellcheck disable=SC2148 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +TESTDIR=`mktemp -d ${TMPDIR:-/tmp}/rocksdb-dump-test.XXXXX` +DUMPFILE="tools/sample-dump.dmp" + +# Verify that the sample dump file is undumpable and then redumpable. +./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db +./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump +cmp $DUMPFILE $TESTDIR/dump diff --git a/src/rocksdb/tools/run_blob_bench.sh b/src/rocksdb/tools/run_blob_bench.sh new file mode 100755 index 000000000..3755a9e56 --- /dev/null +++ b/src/rocksdb/tools/run_blob_bench.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#
# BlobDB benchmark script
#
# REQUIRES: benchmark.sh is in the tools subdirectory
#
# After the execution of this script, log files are available in $output_dir.
# report.tsv provides high level statistics.
#
# Should be run from the parent of the tools directory. The command line is:
# [$env_vars] tools/run_blob_bench.sh
#
# This runs the following sequence of BlobDB performance tests:
# phase 1) write-only - bulkload+compact, overwrite+waitforcompaction
# phase 2) read-write - readwhilewriting, fwdrangewhilewriting
# phase 3) read-only - readrandom, fwdrange
#

# Exit Codes
EXIT_INVALID_ARGS=1

# Size constants
K=1024
M=$((1024 * K))
G=$((1024 * M))
T=$((1024 * G))

function display_usage() {
  echo "usage: run_blob_bench.sh [--help]"
  echo ""
  echo "Runs the following sequence of BlobDB benchmark tests using tools/benchmark.sh:"
  echo -e "\tPhase 1: write-only tests: bulkload+compact, overwrite+waitforcompaction"
  echo -e "\tPhase 2: read-write tests: readwhilewriting, fwdrangewhilewriting"
  echo -e "\tPhase 3: read-only tests: readrandom, fwdrange"
  echo ""
  echo "Environment Variables:"
  echo -e "\tJOB_ID\t\t\t\tIdentifier for the benchmark job, will appear in the results (default: empty)"
  echo -e "\tDB_DIR\t\t\t\tPath for the RocksDB data directory (mandatory)"
  echo -e "\tWAL_DIR\t\t\t\tPath for the RocksDB WAL directory (mandatory)"
  echo -e "\tOUTPUT_DIR\t\t\tPath for the benchmark results (mandatory)"
  echo -e "\tNUM_THREADS\t\t\tNumber of threads (default: 16)"
  echo -e "\tCOMPRESSION_TYPE\t\tCompression type for the SST files (default: lz4)"
  echo -e "\tDB_SIZE\t\t\t\tRaw (uncompressed) database size (default: 1 TB)"
  echo -e "\tVALUE_SIZE\t\t\tValue size (default: 1 KB)"
  echo -e "\tNUM_KEYS\t\t\tNumber of keys (default: raw database size divided by value size)"
  echo -e "\tDURATION\t\t\tIndividual duration for read-write/read-only tests in seconds (default: 1800)"
  echo -e "\tWRITE_BUFFER_SIZE\t\tWrite buffer (memtable) size (default: 1 GB)"
  echo -e "\tENABLE_BLOB_FILES\t\tEnable blob files (default: 1)"
  echo -e "\tMIN_BLOB_SIZE\t\t\tSize threshold for storing values in blob files (default: 0)"
  echo -e "\tBLOB_FILE_SIZE\t\t\tBlob file size (default: same as write buffer size)"
  echo -e "\tBLOB_COMPRESSION_TYPE\t\tCompression type for the blob files (default: lz4)"
  echo -e "\tENABLE_BLOB_GC\t\t\tEnable blob garbage collection (default: 1)"
  echo -e "\tBLOB_GC_AGE_CUTOFF\t\tBlob garbage collection age cutoff (default: 0.25)"
  echo -e "\tBLOB_GC_FORCE_THRESHOLD\t\tThreshold for forcing garbage collection of the oldest blob files (default: 1.0)"
  echo -e "\tBLOB_COMPACTION_READAHEAD_SIZE\tBlob compaction readahead size (default: 0)"
  echo -e "\tBLOB_FILE_STARTING_LEVEL\t\tBlob file starting level (default: 0)"
  echo -e "\tUSE_BLOB_CACHE\t\t\tEnable blob cache. (default: 1)"
  echo -e "\tUSE_SHARED_BLOCK_AND_BLOB_CACHE\t\t\tUse the same backing cache for block cache and blob cache. (default: 1)"
  echo -e "\tBLOB_CACHE_SIZE\t\t\tSize of the blob cache (default: 16GB)"
  echo -e "\tBLOB_CACHE_NUMSHARDBITS\t\t\tNumber of shards for the blob cache is 2 ** blob_cache_numshardbits (default: 6)"
  echo -e "\tPREPOPULATE_BLOB_CACHE\t\t\tPre-populate hot/warm blobs in blob cache (default: 0)"
  echo -e "\tTARGET_FILE_SIZE_BASE\t\tTarget SST file size for compactions (default: write buffer size, scaled down if blob files are enabled)"
  echo -e "\tMAX_BYTES_FOR_LEVEL_BASE\tMaximum size for the base level (default: 8 * target SST file size)"
}

if [ $# -ge 1 ]; then
  display_usage

  if [ "$1" == "--help" ]; then
    exit
  else
    exit $EXIT_INVALID_ARGS
  fi
fi

# FIX: the three identical mandatory-variable checks were copy-pasted; one
# loop with indirect expansion keeps the messages and exit code identical.
# shellcheck disable=SC2153
for mandatory_var in DB_DIR WAL_DIR OUTPUT_DIR; do
  if [ -z "${!mandatory_var}" ]; then
    echo "$mandatory_var is not defined"
    exit $EXIT_INVALID_ARGS
  fi
done

# shellcheck disable=SC2153
job_id=$JOB_ID

db_dir=$DB_DIR
wal_dir=$WAL_DIR
output_dir=$OUTPUT_DIR

num_threads=${NUM_THREADS:-16}

compression_type=${COMPRESSION_TYPE:-lz4}

db_size=${DB_SIZE:-$((1 * T))}
value_size=${VALUE_SIZE:-$((1 * K))}
num_keys=${NUM_KEYS:-$((db_size / value_size))}

duration=${DURATION:-1800}

write_buffer_size=${WRITE_BUFFER_SIZE:-$((1 * G))}

enable_blob_files=${ENABLE_BLOB_FILES:-1}
min_blob_size=${MIN_BLOB_SIZE:-0}
blob_file_size=${BLOB_FILE_SIZE:-$write_buffer_size}
blob_compression_type=${BLOB_COMPRESSION_TYPE:-lz4}
enable_blob_garbage_collection=${ENABLE_BLOB_GC:-1}
blob_garbage_collection_age_cutoff=${BLOB_GC_AGE_CUTOFF:-0.25}
blob_garbage_collection_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1.0}
blob_compaction_readahead_size=${BLOB_COMPACTION_READAHEAD_SIZE:-0}
blob_file_starting_level=${BLOB_FILE_STARTING_LEVEL:-0}
use_blob_cache=${USE_BLOB_CACHE:-1}
use_shared_block_and_blob_cache=${USE_SHARED_BLOCK_AND_BLOB_CACHE:-1}
blob_cache_size=${BLOB_CACHE_SIZE:-$((16 * G))}
blob_cache_numshardbits=${BLOB_CACHE_NUMSHARDBITS:-6}
prepopulate_blob_cache=${PREPOPULATE_BLOB_CACHE:-0}

# With blob files enabled the SST files only hold keys and blob references,
# so the default target SST size is scaled down accordingly.
if [ "$enable_blob_files" == "1" ]; then
  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$((32 * write_buffer_size / value_size))}
else
  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$write_buffer_size}
fi

max_bytes_for_level_base=${MAX_BYTES_FOR_LEVEL_BASE:-$((8 * target_file_size_base))}

echo "======================== Benchmark setup ========================"
echo -e "Job ID:\t\t\t\t\t$job_id"
echo -e "Data directory:\t\t\t\t$db_dir"
echo -e "WAL directory:\t\t\t\t$wal_dir"
echo -e "Output directory:\t\t\t$output_dir"
echo -e "Number of threads:\t\t\t$num_threads"
echo -e "Compression type for SST files:\t\t$compression_type"
echo -e "Raw database size:\t\t\t$db_size"
echo -e "Value size:\t\t\t\t$value_size"
echo -e "Number of keys:\t\t\t\t$num_keys"
echo -e "Duration of read-write/read-only tests:\t$duration"
echo -e "Write buffer size:\t\t\t$write_buffer_size"
echo -e "Blob files enabled:\t\t\t$enable_blob_files"
echo -e "Blob size threshold:\t\t\t$min_blob_size"
echo -e "Blob file size:\t\t\t\t$blob_file_size"
echo -e "Compression type for blob files:\t$blob_compression_type"
echo -e "Blob GC enabled:\t\t\t$enable_blob_garbage_collection"
echo -e "Blob GC age cutoff:\t\t\t$blob_garbage_collection_age_cutoff"
echo -e "Blob GC force threshold:\t\t$blob_garbage_collection_force_threshold"
echo -e "Blob compaction readahead size:\t\t$blob_compaction_readahead_size"
echo -e "Blob file starting level:\t\t$blob_file_starting_level"
echo -e "Blob cache enabled:\t\t\t$use_blob_cache"
echo -e "Blob cache and block cache shared:\t\t\t$use_shared_block_and_blob_cache"
echo -e "Blob cache size:\t\t$blob_cache_size"
echo -e "Blob cache number of shard bits:\t\t$blob_cache_numshardbits"
echo -e "Blob cache prepopulated:\t\t\t$prepopulate_blob_cache"
echo -e "Target SST file size:\t\t\t$target_file_size_base"
echo -e "Maximum size of base level:\t\t$max_bytes_for_level_base"
echo "================================================================="

rm -rf "$db_dir"
rm -rf "$wal_dir"
rm -rf "$output_dir"

ENV_VARS="\
  JOB_ID=$job_id \
  DB_DIR=$db_dir \
  WAL_DIR=$wal_dir \
  OUTPUT_DIR=$output_dir \
  NUM_THREADS=$num_threads \
  COMPRESSION_TYPE=$compression_type \
  VALUE_SIZE=$value_size \
  NUM_KEYS=$num_keys"

ENV_VARS_D="$ENV_VARS DURATION=$duration"

PARAMS="\
  --enable_blob_files=$enable_blob_files \
  --min_blob_size=$min_blob_size \
  --blob_file_size=$blob_file_size \
  --blob_compression_type=$blob_compression_type \
  --blob_file_starting_level=$blob_file_starting_level \
  --use_blob_cache=$use_blob_cache \
  --use_shared_block_and_blob_cache=$use_shared_block_and_blob_cache \
  --blob_cache_size=$blob_cache_size \
  --blob_cache_numshardbits=$blob_cache_numshardbits \
  --prepopulate_blob_cache=$prepopulate_blob_cache \
  --write_buffer_size=$write_buffer_size \
  --target_file_size_base=$target_file_size_base \
  --max_bytes_for_level_base=$max_bytes_for_level_base"

PARAMS_GC="$PARAMS \
  --enable_blob_garbage_collection=$enable_blob_garbage_collection \
  --blob_garbage_collection_age_cutoff=$blob_garbage_collection_age_cutoff \
  --blob_garbage_collection_force_threshold=$blob_garbage_collection_force_threshold \
  --blob_compaction_readahead_size=$blob_compaction_readahead_size"

# bulk load (using fillrandom) + compact
env -u DURATION -S "$ENV_VARS" ./tools/benchmark.sh bulkload "$PARAMS"

# overwrite + waitforcompaction
env -u DURATION -S "$ENV_VARS" ./tools/benchmark.sh overwrite "$PARAMS_GC"

# readwhilewriting
env -S "$ENV_VARS_D" ./tools/benchmark.sh readwhilewriting "$PARAMS_GC"

# fwdrangewhilewriting
env -S "$ENV_VARS_D" ./tools/benchmark.sh fwdrangewhilewriting "$PARAMS_GC"

# readrandom
env -S "$ENV_VARS_D" ./tools/benchmark.sh readrandom "$PARAMS_GC"

# fwdrange
env -S "$ENV_VARS_D" ./tools/benchmark.sh fwdrange "$PARAMS_GC"

# save logs to output directory
cp "$db_dir"/LOG* "$output_dir/"

# ---- src/rocksdb/tools/run_flash_bench.sh (continues in the next chunk) ----
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# REQUIRE: benchmark.sh exists in the current directory
# After execution of this script, log files are generated in $output_dir.
# report.txt provides a high level statistics

# This should be run from the parent of the tools directory. The command line is:
# [$env_vars] tools/run_flash_bench.sh [list-of-threads]
#
# This runs a sequence of tests in the following sequence:
# step 1) load - bulkload, compact, fillseq, overwrite
# step 2) read-only for each number of threads
# step 3) read-write for each number of threads
# step 4) merge for each number of threads
#
# The list of threads is optional and when not set is equivalent to "24".
# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
# only run for 1 thread.

# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
# test and the tests are listed below.
#
# The environment variables are also optional. The variables are:
#
# NKEYS - number of key/value pairs to load
# BG_MBWRITEPERSEC - write rate limit in MB/second for tests in which
#           there is one thread doing writes and stats are
#           reported for read threads. "BG" stands for background.
#           If this is too large then the non-writer threads can get
#           starved. This is used for the "readwhile" tests.
# FG_MBWRITEPERSEC - write rate limit in MB/second for tests like overwrite
#           where stats are reported for the write threads.
# NSECONDS - number of seconds for which to run each test in steps 2,
#           3 and 4. There are currently 15 tests in those steps and
#           they are repeated for each entry in list-of-threads so
#           this variable lets you control the total duration to
#           finish the benchmark.
# RANGE_LIMIT - the number of rows to read per range query for tests that
#           do range queries.
# VAL_SIZE - the length of the value in the key/value pairs loaded.
#           You can estimate the size of the test database from this,
#           NKEYS and the compression rate (--compression_ratio) set
#           in tools/benchmark.sh
# BLOCK_LENGTH - value for db_bench --block_size
# CACHE_BYTES - the size of the RocksDB block cache in bytes
# DATA_DIR - directory in which to create database files
# LOG_DIR - directory in which to create WAL files, may be the same
#           as DATA_DIR
# DO_SETUP - when set to 0 then a backup of the database is copied from
#           $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
#           The WAL directory is also copied from a backup if
#           DATA_DIR != LOG_DIR. This allows tests from steps 2, 3, 4
#           to be repeated faster.
# SAVE_SETUP - saves a copy of the database at the end of step 1 to
#           $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied
#           to $LOG_DIR.bak.
# SKIP_LOW_PRI_TESTS - skip some of the tests which aren't crucial for getting
#           actionable benchmarking data (look for keywords "bulkload",
#           "sync=1", and "while merging").
#

# Size constants
K=1024
M=$((1024 * K))
G=$((1024 * M))

num_keys=${NKEYS:-$((1 * G))}
# write rate for readwhile... tests
bg_mbwps=${BG_MBWRITEPERSEC:-4}
# write rate for tests other than readwhile, 0 means no limit
fg_mbwps=${FG_MBWRITEPERSEC:-0}
duration=${NSECONDS:-$((60 * 60))}
nps=${RANGE_LIMIT:-10}
vs=${VAL_SIZE:-400}
cs=${CACHE_BYTES:-$(( 1 * G ))}
bs=${BLOCK_LENGTH:-8192}

# If no command line arguments then run for 24 threads.
if [[ $# -eq 0 ]]; then
  nthreads=( 24 )
else
  nthreads=( "$@" )
fi

for num_thr in "${nthreads[@]}" ; do
  echo Will run for $num_thr threads
done

# Update these parameters before execution !!!
db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}

do_setup=${DO_SETUP:-1}
save_setup=${SAVE_SETUP:-0}

# By default we'll run all the tests. Set this to skip a set of tests which
# aren't critical for getting key metrics.
skip_low_pri_tests=${SKIP_LOW_PRI_TESTS:-0}

if [[ $skip_low_pri_tests == 1 ]]; then
  echo "Skipping some non-critical tests because SKIP_LOW_PRI_TESTS is set."
fi

output_dir="${TMPDIR:-/tmp}/output"

ARGS="\
OUTPUT_DIR=$output_dir \
NUM_KEYS=$num_keys \
DB_DIR=$db_dir \
WAL_DIR=$wal_dir \
VALUE_SIZE=$vs \
BLOCK_SIZE=$bs \
CACHE_SIZE=$cs"

mkdir -p $output_dir
echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" \
    > $output_dir/report.txt

# Notes on test sequence:
# step 1) Setup database via sequential fill followed by overwrite to fragment it.
#         Done without setting DURATION to make sure that overwrite does $num_keys writes
# step 2) read-only tests for all levels of concurrency requested
# step 3) non read-only tests for all levels of concurrency requested
# step 4) merge tests for all levels of concurrency requested. These must come last.

###### Setup the database

if [[ $do_setup != 0 ]]; then
  echo Doing setup

  if [[ $skip_low_pri_tests != 1 ]]; then
    # Test 1: bulk load
    env $ARGS ./tools/benchmark.sh bulkload
  fi

  # Test 2a: sequential fill with large values to get peak ingest
  #          adjust NUM_KEYS given the use of larger values
  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
    ./tools/benchmark.sh fillseq_disable_wal

  # Test 2b: sequential fill with the configured value size
  env $ARGS ./tools/benchmark.sh fillseq_disable_wal

  # Test 2c: same as 2a, but with WAL being enabled.
  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
    ./tools/benchmark.sh fillseq_enable_wal

  # Test 2d: same as 2b, but with WAL being enabled.
  env $ARGS ./tools/benchmark.sh fillseq_enable_wal

  # Test 3: single-threaded overwrite
  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite

else
  echo Restoring from backup

  rm -rf $db_dir

  if [ ! -d ${db_dir}.bak ]; then
    echo Database backup does not exist at ${db_dir}.bak
    # FIX: "exit -1" is non-portable (exit status must be 0-255); use 1.
    exit 1
  fi

  echo Restore database from ${db_dir}.bak
  cp -p -r ${db_dir}.bak $db_dir

  if [[ $db_dir != $wal_dir ]]; then
    rm -rf $wal_dir

    if [ ! -d ${wal_dir}.bak ]; then
      echo WAL backup does not exist at ${wal_dir}.bak
      # FIX: non-portable "exit -1" replaced with "exit 1".
      exit 1
    fi

    echo Restore WAL from ${wal_dir}.bak
    cp -p -r ${wal_dir}.bak $wal_dir
  fi
fi

if [[ $save_setup != 0 ]]; then
  echo Save database to ${db_dir}.bak
  cp -p -r $db_dir ${db_dir}.bak

  if [[ $db_dir != $wal_dir ]]; then
    echo Save WAL to ${wal_dir}.bak
    cp -p -r $wal_dir ${wal_dir}.bak
  fi
fi

###### Read-only tests

for num_thr in "${nthreads[@]}" ; do
  # Test 4: random read
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom

  # Test 5: random range scans
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
    ./tools/benchmark.sh fwdrange

  # Test 6: random reverse range scans
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
    ./tools/benchmark.sh revrange
done

###### Non read-only tests

for num_thr in "${nthreads[@]}" ; do
  # Test 7: overwrite with sync=0
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite

  if [[ $skip_low_pri_tests != 1 ]]; then
    # Test 8: overwrite with sync=1
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
      ./tools/benchmark.sh overwrite
  fi

  # Test 9: random update with sync=0
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
    ./tools/benchmark.sh updaterandom

  if [[ $skip_low_pri_tests != 1 ]]; then
    # Test 10: random update with sync=1
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
  fi

  # Test 11: random read while writing
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilewriting

  # Test 12: range scan while writing
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting

  # Test 13: reverse range scan while writing
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
done

###### Merge tests

for num_thr in "${nthreads[@]}" ; do
  # Test 14: random merge with sync=0
  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh mergerandom

  if [[ $skip_low_pri_tests != 1 ]]; then
    # Test 15: random merge with sync=1
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
      ./tools/benchmark.sh mergerandom

    # Test 16: random read while merging
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
      DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging

    # Test 17: range scan while merging
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
      DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging

    # Test 18: reverse range scan while merging
    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
      DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
  fi
done

###### Universal compaction tests.

# Use a single thread to reduce the variability in the benchmark.
env $ARGS COMPACTION_TEST=1 NUM_THREADS=1 ./tools/benchmark.sh universal_compaction

if [[ $skip_low_pri_tests != 1 ]]; then
  echo bulkload > $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
fi

echo fillseq_wal_disabled >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep fillseq.wal_disabled $output_dir/report.txt >> $output_dir/report2.txt

echo fillseq_wal_enabled >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep fillseq.wal_enabled $output_dir/report.txt >> $output_dir/report2.txt

echo overwrite sync=0 >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep overwrite $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo overwrite sync=1 >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
fi

echo updaterandom sync=0 >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep updaterandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo updaterandom sync=1 >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
fi

echo mergerandom sync=0 >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep mergerandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo mergerandom sync=1 >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
fi

echo readrandom >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep readrandom $output_dir/report.txt >> $output_dir/report2.txt

echo fwdrange >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep fwdrange\.t $output_dir/report.txt >> $output_dir/report2.txt

echo revrange >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep revrange\.t $output_dir/report.txt >> $output_dir/report2.txt

# FIX: the line below had a duplicated ">> $output_dir/report2.txt"
# redirection (copy-paste typo); a single append redirection suffices.
echo readwhile >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo readwhile >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt
fi

echo fwdreadwhilewriting >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep fwdrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo fwdreadwhilemerging >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
fi

echo revreadwhilewriting >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep revrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt

if [[ $skip_low_pri_tests != 1 ]]; then
  echo revreadwhilemerging >> $output_dir/report2.txt
  head -1 $output_dir/report.txt >> $output_dir/report2.txt
  grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
fi

cat $output_dir/report2.txt

# ---- src/rocksdb/tools/run_leveldb.sh (continues in the next chunk) ----
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# REQUIRE: benchmark_leveldb.sh exists in the current directory
# After execution of this script, log files are generated in $output_dir.
# report.txt provides a high level statistics
#
# This should be used with the LevelDB fork listed here to use additional test options.
# For more details on the changes see the blog post listed below.
# https://github.com/mdcallag/leveldb-1
# http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
#
# This should be run from the parent of the tools directory. The command line is:
# [$env_vars] tools/run_flash_bench.sh [list-of-threads]
#
# This runs a sequence of tests in the following sequence:
# step 1) load - bulkload, compact, fillseq, overwrite
# step 2) read-only for each number of threads
# step 3) read-write for each number of threads
#
# The list of threads is optional and when not set is equivalent to "24".
# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
# only run for 1 thread.

# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
# test and the tests are listed below.
#
# The environment variables are also optional. The variables are:
# NKEYS - number of key/value pairs to load
# NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
#           If this is too large then the non-writer threads can get
#           starved.
# VAL_SIZE - the length of the value in the key/value pairs loaded.
#           You can estimate the size of the test database from this,
#           NKEYS and the compression rate (--compression_ratio) set
#           in tools/benchmark_leveldb.sh
# BLOCK_LENGTH - value for db_bench --block_size
# CACHE_BYTES - the size of the RocksDB block cache in bytes
# DATA_DIR - directory in which to create database files
# DO_SETUP - when set to 0 then a backup of the database is copied from
#           $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
#           This allows tests from steps 2, 3 to be repeated faster.
# SAVE_SETUP - saves a copy of the database at the end of step 1 to
#           $DATA_DIR.bak.

# Size constants
K=1024
M=$((1024 * K))
G=$((1024 * M))

num_keys=${NKEYS:-$((1 * G))}
wps=${NWRITESPERSEC:-$((10 * K))}
vs=${VAL_SIZE:-400}
cs=${CACHE_BYTES:-$(( 1 * G ))}
bs=${BLOCK_LENGTH:-4096}

# If no command line arguments then run for 24 threads.
if [[ $# -eq 0 ]]; then
  nthreads=( 24 )
else
  nthreads=( "$@" )
fi

for num_thr in "${nthreads[@]}" ; do
  echo Will run for $num_thr threads
done

# Update these parameters before execution !!!
db_dir=${DATA_DIR:-"/tmp/rocksdb/"}

do_setup=${DO_SETUP:-1}
save_setup=${SAVE_SETUP:-0}

output_dir="${TMPDIR:-/tmp}/output"

ARGS="\
OUTPUT_DIR=$output_dir \
NUM_KEYS=$num_keys \
DB_DIR=$db_dir \
VALUE_SIZE=$vs \
BLOCK_SIZE=$bs \
CACHE_SIZE=$cs"

mkdir -p $output_dir
echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest" \
    > $output_dir/report.txt

# Notes on test sequence:
# step 1) Setup database via sequential fill followed by overwrite to fragment it.
#         Done without setting DURATION to make sure that overwrite does $num_keys writes
# step 2) read-only tests for all levels of concurrency requested
# step 3) non read-only tests for all levels of concurrency requested

###### Setup the database

if [[ $do_setup != 0 ]]; then
  echo Doing setup

  # Test 2a: sequential fill with large values to get peak ingest
  #          adjust NUM_KEYS given the use of larger values
  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
    ./tools/benchmark_leveldb.sh fillseq

  # Test 2b: sequential fill with the configured value size
  env $ARGS ./tools/benchmark_leveldb.sh fillseq

  # Test 3: single-threaded overwrite
  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark_leveldb.sh overwrite

else
  echo Restoring from backup

  rm -rf $db_dir

  if [ ! -d ${db_dir}.bak ]; then
    echo Database backup does not exist at ${db_dir}.bak
    # FIX: "exit -1" is non-portable (exit status must be 0-255); use 1.
    exit 1
  fi

  echo Restore database from ${db_dir}.bak
  cp -p -r ${db_dir}.bak $db_dir
fi

if [[ $save_setup != 0 ]]; then
  echo Save database to ${db_dir}.bak
  cp -p -r $db_dir ${db_dir}.bak
fi

###### Read-only tests

for num_thr in "${nthreads[@]}" ; do
  # Test 4: random read
  env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh readrandom

done

###### Non read-only tests

for num_thr in "${nthreads[@]}" ; do
  # Test 7: overwrite with sync=0
  env $ARGS NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
    ./tools/benchmark_leveldb.sh overwrite

  # Test 8: overwrite with sync=1
  # Not run for now because LevelDB db_bench doesn't have an option to limit the
  # test run to X seconds and doing sync-per-commit for --num can take too long.
  # env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh overwrite

  # Test 11: random read while writing
  env $ARGS NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
    ./tools/benchmark_leveldb.sh readwhilewriting

done

echo bulkload > $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
echo fillseq >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep fillseq $output_dir/report.txt >> $output_dir/report2.txt
echo overwrite sync=0 >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep overwrite $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
echo overwrite sync=1 >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
echo readrandom >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep readrandom $output_dir/report.txt >> $output_dir/report2.txt
# FIX: the line below had a duplicated ">> $output_dir/report2.txt"
# redirection (copy-paste typo); a single append redirection suffices.
echo readwhile >> $output_dir/report2.txt
head -1 $output_dir/report.txt >> $output_dir/report2.txt
grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt

cat $output_dir/report2.txt

# ---- src/rocksdb/tools/sample-dump.dmp (binary file, content omitted) ----
# ---- src/rocksdb/tools/simulated_hybrid_file_system.cc (C++, continues below) ----
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/stop_watch.h" +#ifndef ROCKSDB_LITE + +#include <algorithm> +#include <sstream> +#include <string> + +#include "rocksdb/rate_limiter.h" +#include "tools/simulated_hybrid_file_system.h" + +namespace ROCKSDB_NAMESPACE { + +const int64_t kUsPerSec = 1000000; +const int64_t kDummyBytesPerUs = 1024; + +namespace { +// From bytes to read/write, calculate service time needed by an HDD. +// This is used to simulate latency from HDD. +int CalculateServeTimeUs(size_t bytes) { + return 12200 + static_cast<int>(static_cast<double>(bytes) * 0.005215); +} + +// There is a bug in rater limiter that would crash with small requests +// Hack to get it around. +void RateLimiterRequest(RateLimiter* rater_limiter, int64_t amount) { + int64_t left = amount * kDummyBytesPerUs; + const int64_t kMaxToRequest = kDummyBytesPerUs * kUsPerSec / 1024; + while (left > 0) { + int64_t to_request = std::min(kMaxToRequest, left); + rater_limiter->Request(to_request, Env::IOPriority::IO_LOW, nullptr); + left -= to_request; + } +} +} // namespace + +// The metadata file format: each line is a full filename of a file which is +// warm +SimulatedHybridFileSystem::SimulatedHybridFileSystem( + const std::shared_ptr<FileSystem>& base, + const std::string& metadata_file_name, int throughput_multiplier, + bool is_full_fs_warm) + : FileSystemWrapper(base), + // Limit to 100 requests per second. 
+ rate_limiter_(NewGenericRateLimiter( + int64_t{throughput_multiplier} * kDummyBytesPerUs * + kUsPerSec /* rate_bytes_per_sec */, + 1000 /* refill_period_us */)), + metadata_file_name_(metadata_file_name), + name_("SimulatedHybridFileSystem: " + std::string(target()->Name())), + is_full_fs_warm_(is_full_fs_warm) { + IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr); + if (s.IsNotFound()) { + return; + } + std::string metadata; + s = ReadFileToString(base.get(), metadata_file_name, &metadata); + if (!s.ok()) { + fprintf(stderr, "Error reading from file %s: %s", + metadata_file_name.c_str(), s.ToString().c_str()); + // Exit rather than assert as this file system is built to run with + // benchmarks, which usually run on release mode. + std::exit(1); + } + std::istringstream input; + input.str(metadata); + std::string line; + while (std::getline(input, line)) { + fprintf(stderr, "Warm file %s\n", line.c_str()); + warm_file_set_.insert(line); + } +} + +// Need to write out the metadata file to file. See comment of +// SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the +// file. 
+SimulatedHybridFileSystem::~SimulatedHybridFileSystem() { + if (metadata_file_name_.empty()) { + return; + } + std::string metadata; + for (const auto& f : warm_file_set_) { + metadata += f; + metadata += "\n"; + } + IOStatus s = WriteStringToFile(target(), metadata, metadata_file_name_, true); + if (!s.ok()) { + fprintf(stderr, "Error writing to file %s: %s", metadata_file_name_.c_str(), + s.ToString().c_str()); + } +} + +IOStatus SimulatedHybridFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) { + Temperature temperature = Temperature::kUnknown; + if (is_full_fs_warm_) { + temperature = Temperature::kWarm; + } else { + const std::lock_guard<std::mutex> lock(mutex_); + if (warm_file_set_.find(fname) != warm_file_set_.end()) { + temperature = Temperature::kWarm; + } + assert(temperature == file_opts.temperature); + } + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + result->reset( + new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature)); + return s; +} + +IOStatus SimulatedHybridFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) { + if (file_opts.temperature == Temperature::kWarm) { + const std::lock_guard<std::mutex> lock(mutex_); + warm_file_set_.insert(fname); + } + + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + if (file_opts.temperature == Temperature::kWarm || is_full_fs_warm_) { + result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_)); + } + return s; +} + +IOStatus SimulatedHybridFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + { + const std::lock_guard<std::mutex> lock(mutex_); + warm_file_set_.erase(fname); + } + return target()->DeleteFile(fname, options, dbg); +} + +IOStatus 
SimulatedHybridRaf::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + if (temperature_ == Temperature::kWarm) { + SimulateIOWait(n); + } + return target()->Read(offset, n, options, result, scratch, dbg); +} + +IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + if (temperature_ == Temperature::kWarm) { + for (size_t i = 0; i < num_reqs; i++) { + SimulateIOWait(reqs[i].len); + } + } + return target()->MultiRead(reqs, num_reqs, options, dbg); +} + +IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + if (temperature_ == Temperature::kWarm) { + SimulateIOWait(n); + } + return target()->Prefetch(offset, n, options, dbg); +} + +void SimulatedHybridRaf::SimulateIOWait(int64_t bytes) const { + int serve_time = CalculateServeTimeUs(bytes); + { + StopWatchNano stop_watch(Env::Default()->GetSystemClock().get(), + /*auto_start=*/true); + RateLimiterRequest(rate_limiter_.get(), serve_time); + int time_passed_us = static_cast<int>(stop_watch.ElapsedNanos() / 1000); + if (time_passed_us < serve_time) { + Env::Default()->SleepForMicroseconds(serve_time - time_passed_us); + } + } +} + +void SimulatedWritableFile::SimulateIOWait(int64_t bytes) const { + int serve_time = CalculateServeTimeUs(bytes); + Env::Default()->SleepForMicroseconds(serve_time); + RateLimiterRequest(rate_limiter_.get(), serve_time); +} + +IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo, + IODebugContext* idc) { + if (use_direct_io()) { + SimulateIOWait(data.size()); + } else { + unsynced_bytes += data.size(); + } + return target()->Append(data, ioo, idc); +} + +IOStatus SimulatedWritableFile::Append( + const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + if (use_direct_io()) { + 
SimulateIOWait(data.size()); + } else { + unsynced_bytes += data.size(); + } + return target()->Append(data, options, verification_info, dbg); +} + +IOStatus SimulatedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { + if (use_direct_io()) { + SimulateIOWait(data.size()); + } else { + // This might be overcalculated, but it's probably OK. + unsynced_bytes += data.size(); + } + return target()->PositionedAppend(data, offset, options, dbg); +} +IOStatus SimulatedWritableFile::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + const DataVerificationInfo& verification_info, IODebugContext* dbg) { + if (use_direct_io()) { + SimulateIOWait(data.size()); + } else { + // This might be overcalculated, but it's probably OK. + unsynced_bytes += data.size(); + } + return target()->PositionedAppend(data, offset, options, verification_info, + dbg); +} + +IOStatus SimulatedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + if (unsynced_bytes > 0) { + SimulateIOWait(unsynced_bytes); + unsynced_bytes = 0; + } + return target()->Sync(options, dbg); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/simulated_hybrid_file_system.h b/src/rocksdb/tools/simulated_hybrid_file_system.h new file mode 100644 index 000000000..251d89df7 --- /dev/null +++ b/src/rocksdb/tools/simulated_hybrid_file_system.h @@ -0,0 +1,126 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <utility> + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem simulates hybrid file system by ingesting latency and limit +// IOPs. 
// This class is only used for development purpose and should not be used
// in production.
// Right now we ingest 15ms latency and allow 100 requests per second when
// the file is for warm temperature.
// When the object is destroyed, the list of warm files are written to a
// file, which can be used to reopen a FileSystem and still recover the
// list. This is to allow the information to preserve between db_bench
// runs.
class SimulatedHybridFileSystem : public FileSystemWrapper {
 public:
  // metadata_file_name stores metadata of the files, so that it can be
  // loaded after process restarts. If the file doesn't exist, create
  // one. The file is written when the class is destroyed.
  // throughput_multiplier: multiplier of throughput. For example, 1 is to
  // simulate single disk spindle. 4 is to simulate 4 disk spindles.
  // is_full_fs_warm: if true, all files are all included in slow I/O
  // simulation.
  SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
                            const std::string& metadata_file_name,
                            int throughput_multiplier, bool is_full_fs_warm);

  ~SimulatedHybridFileSystem() override;

 public:
  IOStatus NewRandomAccessFile(const std::string& fname,
                               const FileOptions& file_opts,
                               std::unique_ptr<FSRandomAccessFile>* result,
                               IODebugContext* dbg) override;
  IOStatus NewWritableFile(const std::string& fname,
                           const FileOptions& file_opts,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override;
  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
                      IODebugContext* dbg) override;

  const char* Name() const override { return name_.c_str(); }

 private:
  // Limit 100 requests per second. The rate limiter is byte-based, but we
  // use it as if a fixed number of bytes were one request.
  std::shared_ptr<RateLimiter> rate_limiter_;
  // Guards warm_file_set_ against concurrent open/delete calls.
  std::mutex mutex_;
  std::unordered_set<std::string> warm_file_set_;
  std::string metadata_file_name_;
  std::string name_;
  bool is_full_fs_warm_;
};

// Simulated random access file that can control IOPs and latency to simulate
// specific storage media
class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper {
 public:
  SimulatedHybridRaf(std::unique_ptr<FSRandomAccessFile>&& t,
                     std::shared_ptr<RateLimiter> rate_limiter,
                     Temperature temperature)
      : FSRandomAccessFileOwnerWrapper(std::move(t)),
        rate_limiter_(rate_limiter),
        temperature_(temperature) {}

  ~SimulatedHybridRaf() override {}

  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
                Slice* result, char* scratch,
                IODebugContext* dbg) const override;

  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
                     const IOOptions& options, IODebugContext* dbg) override;

  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
                    IODebugContext* dbg) override;

 private:
  std::shared_ptr<RateLimiter> rate_limiter_;
  Temperature temperature_;

  // NOTE(review): the .cc definition treats this parameter as a byte count,
  // not a request count — confirm and consider renaming.
  void SimulateIOWait(int64_t num_requests) const;
};

// Simulated writable file that defers the simulated latency of buffered
// writes until Sync().
class SimulatedWritableFile : public FSWritableFileWrapper {
 public:
  SimulatedWritableFile(std::unique_ptr<FSWritableFile>&& t,
                        std::shared_ptr<RateLimiter> rate_limiter)
      : FSWritableFileWrapper(t.get()),
        file_guard_(std::move(t)),
        rate_limiter_(rate_limiter) {}
  IOStatus Append(const Slice& data, const IOOptions&,
                  IODebugContext*) override;
  IOStatus Append(const Slice& data, const IOOptions& options,
                  const DataVerificationInfo& verification_info,
                  IODebugContext* dbg) override;
  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
                            const IOOptions& options,
                            IODebugContext* dbg) override;
  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
                            const IOOptions& options,
                            const DataVerificationInfo& verification_info,
                            IODebugContext* dbg) override;

 private:
  // Owns the wrapped file; the base wrapper only holds a raw pointer.
  std::unique_ptr<FSWritableFile> file_guard_;
  std::shared_ptr<RateLimiter> rate_limiter_;
  // Bytes appended (buffered) since the last Sync(); charged at Sync().
  size_t unsynced_bytes = 0;

  void SimulateIOWait(int64_t num_requests) const;
};
}  // namespace ROCKSDB_NAMESPACE

#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump.cc b/src/rocksdb/tools/sst_dump.cc
new file mode 100644
index 000000000..becf67316
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump.cc
@@ -0,0 +1,20 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#ifndef ROCKSDB_LITE

#include "rocksdb/sst_dump_tool.h"

// Thin CLI entry point: all argument handling lives in SSTDumpTool::Run.
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::SSTDumpTool tool;
  return tool.Run(argc, argv);
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
  fprintf(stderr, "Not supported in lite mode.\n");
  return 1;
}
#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump_test.cc b/src/rocksdb/tools/sst_dump_test.cc
new file mode 100644
index 000000000..aa1ff810f
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_test.cc
@@ -0,0 +1,481 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE + +#include <stdint.h> + +#include "file/random_access_file_reader.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/sst_dump_tool.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t kOptLength = 1024; + +namespace { +static std::string MakeKey(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "k_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +static std::string MakeKeyWithTimeStamp(int i, uint64_t ts) { + char buf[100]; + snprintf(buf, sizeof(buf), "k_%04d", i); + return test::KeyStr(ts, std::string(buf), /*seq=*/0, kTypeValue); +} + +static std::string MakeValue(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "v_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +void cleanup(const Options& opts, const std::string& file_name) { + Env* env = opts.env; + ASSERT_OK(env->DeleteFile(file_name)); + std::string outfile_name = file_name.substr(0, file_name.length() - 4); + outfile_name.append("_dump.txt"); + env->DeleteFile(outfile_name).PermitUncheckedError(); +} +} // namespace + +// Test for sst dump tool "raw" mode +class SSTDumpToolTest : public testing::Test { + std::string test_dir_; + Env* env_; + std::shared_ptr<Env> env_guard_; + + public: + SSTDumpToolTest() : env_(Env::Default()) { + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + test_dir_ = test::PerThreadDBPath(env_, "sst_dump_test_db"); + Status s = env_->CreateDirIfMissing(test_dir_); + EXPECT_OK(s); + } + + ~SSTDumpToolTest() override { + if (getenv("KEEP_DB")) { + fprintf(stdout, "Data is still at %s\n", test_dir_.c_str()); + } else { + EXPECT_OK(env_->DeleteDir(test_dir_)); + } + } 
+ + Env* env() { return env_; } + + std::string MakeFilePath(const std::string& file_name) const { + std::string path(test_dir_); + path.append("/").append(file_name); + return path; + } + + template <std::size_t N> + void PopulateCommandArgs(const std::string& file_path, const char* command, + char* (&usage)[N]) const { + for (int i = 0; i < static_cast<int>(N); ++i) { + usage[i] = new char[kOptLength]; + } + snprintf(usage[0], kOptLength, "./sst_dump"); + snprintf(usage[1], kOptLength, "%s", command); + snprintf(usage[2], kOptLength, "--file=%s", file_path.c_str()); + } + + void createSST(const Options& opts, const std::string& file_name) { + Env* test_env = opts.env; + FileOptions file_options(opts); + ReadOptions read_options; + const ImmutableOptions imoptions(opts); + const MutableCFOptions moptions(opts); + ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); + std::unique_ptr<TableBuilder> tb; + + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(test_env->GetFileSystem(), file_name, + file_options, &file_writer, nullptr)); + + std::string column_family_name; + int unknown_level = -1; + tb.reset(opts.table_factory->NewTableBuilder( + TableBuilderOptions( + imoptions, moptions, ikc, &int_tbl_prop_collector_factories, + CompressionType::kNoCompression, CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, unknown_level), + file_writer.get())); + + // Populate slightly more than 1K keys + uint32_t num_keys = kNumKey; + const char* comparator_name = ikc.user_comparator()->Name(); + if (strcmp(comparator_name, ReverseBytewiseComparator()->Name()) == 0) { + for (int32_t i = num_keys; i >= 0; i--) { + tb->Add(MakeKey(i), MakeValue(i)); + } + } else if (strcmp(comparator_name, + test::BytewiseComparatorWithU64TsWrapper()->Name()) == + 0) { + for (uint32_t i = 0; i < num_keys; i++) { + 
tb->Add(MakeKeyWithTimeStamp(i, 100 + i), MakeValue(i)); + } + } else { + for (uint32_t i = 0; i < num_keys; i++) { + tb->Add(MakeKey(i), MakeValue(i)); + } + } + ASSERT_OK(tb->Finish()); + ASSERT_OK(file_writer->Close()); + } + + protected: + constexpr static int kNumKey = 1024; +}; + +constexpr int SSTDumpToolTest::kNumKey; + +TEST_F(SSTDumpToolTest, HelpAndVersion) { + Options opts; + opts.env = env(); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + + static const char* help[] = {"./sst_dump", "--help"}; + ASSERT_TRUE(!tool.Run(2, help, opts)); + static const char* version[] = {"./sst_dump", "--version"}; + ASSERT_TRUE(!tool.Run(2, version, opts)); + static const char* bad[] = {"./sst_dump", "--not_an_option"}; + ASSERT_TRUE(tool.Run(2, bad, opts)); +} + +TEST_F(SSTDumpToolTest, EmptyFilter) { + Options opts; + opts.env = env(); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, SstDumpReverseBytewiseComparator) { + Options opts; + opts.env = env(); + opts.comparator = ReverseBytewiseComparator(); + BlockBasedTableOptions table_opts; + table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = + MakeFilePath("rocksdb_sst_reverse_bytewise_comparator.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, SstDumpComparatorWithU64Ts) { + Options opts; + opts.env = env(); + 
opts.comparator = test::BytewiseComparatorWithU64TsWrapper(); + BlockBasedTableOptions table_opts; + table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = + MakeFilePath("rocksdb_sst_comparator_with_u64_ts.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, FilterBlock) { + Options opts; + opts.env = env(); + BlockBasedTableOptions table_opts; + table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, FullFilterBlock) { + Options opts; + opts.env = env(); + BlockBasedTableOptions table_opts; + table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, GetProperties) { + Options opts; + opts.env = env(); + BlockBasedTableOptions table_opts; + 
table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--show_properties", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, CompressedSizes) { + Options opts; + opts.env = env(); + BlockBasedTableOptions table_opts; + table_opts.filter_policy.reset( + ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); + opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=recompress", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, MemEnv) { + std::unique_ptr<Env> mem_env(NewMemEnv(env())); + Options opts; + opts.env = mem_env.get(); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=verify_checksum", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, ReadaheadSize) { + Options opts; + opts.env = env(); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[4]; + PopulateCommandArgs(file_path, "--command=verify", usage); + snprintf(usage[3], kOptLength, "--readahead_size=4000000"); + + int num_reads = 0; + 
SyncPoint::GetInstance()->SetCallBack("RandomAccessFileReader::Read", + [&](void*) { num_reads++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(4, usage, opts)); + + // The file is approximately 10MB. Readahead is 4MB. + // We usually need 3 reads + one metadata read. + // One extra read is needed before opening the file for metadata. + ASSERT_EQ(5, num_reads); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + cleanup(opts, file_path); + for (int i = 0; i < 4; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, NoSstFile) { + Options opts; + opts.env = env(); + std::string file_path = MakeFilePath("no_such_file.sst"); + char* usage[3]; + PopulateCommandArgs(file_path, "", usage); + ROCKSDB_NAMESPACE::SSTDumpTool tool; + for (const auto& command : + {"--command=check", "--command=dump", "--command=raw", + "--command=verify", "--command=recompress", "--command=verify_checksum", + "--show_properties"}) { + snprintf(usage[1], kOptLength, "%s", command); + ASSERT_TRUE(tool.Run(3, usage, opts)); + } + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, ValidSSTPath) { + Options opts; + opts.env = env(); + char* usage[3]; + PopulateCommandArgs("", "", usage); + SSTDumpTool tool; + std::string file_not_exists = MakeFilePath("file_not_exists.sst"); + std::string sst_file = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, sst_file); + std::string text_file = MakeFilePath("text_file"); + ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file)); + std::string fake_sst = MakeFilePath("fake_sst.sst"); + ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst)); + + for (const auto& command_arg : {"--command=verify", "--command=identify"}) { + snprintf(usage[1], kOptLength, "%s", command_arg); + + snprintf(usage[2], kOptLength, "--file=%s", file_not_exists.c_str()); + ASSERT_TRUE(tool.Run(3, usage, opts)); 
+ + snprintf(usage[2], kOptLength, "--file=%s", sst_file.c_str()); + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + snprintf(usage[2], kOptLength, "--file=%s", text_file.c_str()); + ASSERT_TRUE(tool.Run(3, usage, opts)); + + snprintf(usage[2], kOptLength, "--file=%s", fake_sst.c_str()); + ASSERT_TRUE(tool.Run(3, usage, opts)); + } + ASSERT_OK(opts.env->DeleteFile(sst_file)); + ASSERT_OK(opts.env->DeleteFile(text_file)); + ASSERT_OK(opts.env->DeleteFile(fake_sst)); + + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST_F(SSTDumpToolTest, RawOutput) { + Options opts; + opts.env = env(); + std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); + createSST(opts, file_path); + + char* usage[3]; + PopulateCommandArgs(file_path, "--command=raw", usage); + + ROCKSDB_NAMESPACE::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage, opts)); + + const std::string raw_path = MakeFilePath("rocksdb_sst_test_dump.txt"); + std::ifstream raw_file(raw_path); + + std::string tp; + bool is_data_block = false; + int key_count = 0; + while (getline(raw_file, tp)) { + if (tp.find("Data Block #") != std::string::npos) { + is_data_block = true; + } + + if (is_data_block && tp.find("HEX") != std::string::npos) { + key_count++; + } + } + + ASSERT_EQ(kNumKey, key_count); + + raw_file.close(); + + cleanup(opts, file_path); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE return RUN_ALL_TESTS(); diff --git a/src/rocksdb/tools/sst_dump_tool.cc b/src/rocksdb/tools/sst_dump_tool.cc new file mode 100644 index 000000000..0a2c28280 --- /dev/null +++ 
b/src/rocksdb/tools/sst_dump_tool.cc @@ -0,0 +1,584 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_dump_tool.h" + +#include <cinttypes> +#include <iostream> + +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "table/sst_file_dumper.h" + +namespace ROCKSDB_NAMESPACE { + +static const std::vector<std::pair<CompressionType, const char*>> + kCompressions = { + {CompressionType::kNoCompression, "kNoCompression"}, + {CompressionType::kSnappyCompression, "kSnappyCompression"}, + {CompressionType::kZlibCompression, "kZlibCompression"}, + {CompressionType::kBZip2Compression, "kBZip2Compression"}, + {CompressionType::kLZ4Compression, "kLZ4Compression"}, + {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"}, + {CompressionType::kXpressCompression, "kXpressCompression"}, + {CompressionType::kZSTD, "kZSTD"}}; + +namespace { + +void print_help(bool to_stderr) { + std::string supported_compressions; + for (CompressionType ct : GetSupportedCompressions()) { + if (!supported_compressions.empty()) { + supported_compressions += ", "; + } + std::string str; + Status s = GetStringFromCompressionType(&str, ct); + assert(s.ok()); + supported_compressions += str; + } + fprintf( + to_stderr ? 
stderr : stdout, + R"(sst_dump --file=<data_dir_OR_sst_file> [--command=check|scan|raw|recompress|identify] + --file=<data_dir_OR_sst_file> + Path to SST file or directory containing SST files + + --env_uri=<uri of underlying Env> + URI of underlying Env, mutually exclusive with fs_uri + + --fs_uri=<uri of underlying FileSystem> + URI of underlying FileSystem, mutually exclusive with env_uri + + --command=check|scan|raw|verify|identify + check: Iterate over entries in files but don't print anything except if an error is encountered (default command) + scan: Iterate over entries in files and print them to screen + raw: Dump all the table contents to <file_name>_dump.txt + verify: Iterate all the blocks in files verifying checksum to detect possible corruption but don't print anything except if a corruption is encountered + recompress: reports the SST file size if recompressed with different + compression types + identify: Reports a file is a valid SST file or lists all valid SST files under a directory + + --output_hex + Can be combined with scan command to print the keys and values in Hex + + --decode_blob_index + Decode blob indexes and print them in a human-readable format during scans. 
+ + --from=<user_key> + Key to start reading from when executing check|scan + + --to=<user_key> + Key to stop reading at when executing check|scan + + --prefix=<user_key> + Returns all keys with this prefix when executing check|scan + Cannot be used in conjunction with --from + + --read_num=<num> + Maximum number of entries to read when executing check|scan + + --verify_checksum + Verify file checksum when executing check|scan + + --input_key_hex + Can be combined with --from and --to to indicate that these values are encoded in Hex + + --show_properties + Print table properties after iterating over the file when executing + check|scan|raw|identify + + --set_block_size=<block_size> + Can be combined with --command=recompress to set the block size that will + be used when trying different compression algorithms + + --compression_types=<comma-separated list of CompressionType members, e.g., + kSnappyCompression> + Can be combined with --command=recompress to run recompression for this + list of compression types + Supported compression types: %s + + --parse_internal_key=<0xKEY> + Convenience option to parse an internal key on the command line. Dumps the + internal key in hex format {'key' @ SN: type} + + --compression_level_from=<compression_level> + Compression level to start compressing when executing recompress. One compression type + and compression_level_to must also be specified + + --compression_level_to=<compression_level> + Compression level to stop compressing when executing recompress. One compression type + and compression_level_from must also be specified + + --compression_max_dict_bytes=<uint32_t> + Maximum size of dictionary used to prime the compression library + + --compression_zstd_max_train_bytes=<uint32_t> + Maximum size of training data passed to zstd's dictionary trainer + + --compression_max_dict_buffer_bytes=<int64_t> + Limit on buffer size from which we collect samples for dictionary generation. 
+ + --compression_use_zstd_finalize_dict + Use zstd's finalizeDictionary() API instead of zstd's dictionary trainer to generate dictionary. +)", + supported_compressions.c_str()); +} + +// arg_name would include all prefix, e.g. "--my_arg=" +// arg_val is the parses value. +// True if there is a match. False otherwise. +// Woud exit after printing errmsg if cannot be parsed. +bool ParseIntArg(const char* arg, const std::string arg_name, + const std::string err_msg, int64_t* arg_val) { + if (strncmp(arg, arg_name.c_str(), arg_name.size()) == 0) { + std::string input_str = arg + arg_name.size(); + std::istringstream iss(input_str); + iss >> *arg_val; + if (iss.fail()) { + fprintf(stderr, "%s\n", err_msg.c_str()); + exit(1); + } + return true; + } + return false; +} +} // namespace + +int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { + std::string env_uri, fs_uri; + const char* dir_or_file = nullptr; + uint64_t read_num = std::numeric_limits<uint64_t>::max(); + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool decode_blob_index = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + bool use_from_as_prefix = false; + bool show_properties = false; + bool show_summary = false; + bool set_block_size = false; + bool has_compression_level_from = false; + bool has_compression_level_to = false; + bool has_specified_compression_types = false; + std::string from_key; + std::string to_key; + std::string block_size_str; + std::string compression_level_from_str; + std::string compression_level_to_str; + size_t block_size = 0; + size_t readahead_size = 2 * 1024 * 1024; + std::vector<std::pair<CompressionType, const char*>> compression_types; + uint64_t total_num_files = 0; + uint64_t total_num_data_blocks = 0; + uint64_t total_data_block_size = 0; + uint64_t total_index_block_size = 0; + uint64_t total_filter_block_size = 0; + int32_t compress_level_from = 
CompressionOptions::kDefaultCompressionLevel; + int32_t compress_level_to = CompressionOptions::kDefaultCompressionLevel; + uint32_t compression_max_dict_bytes = + ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes; + uint32_t compression_zstd_max_train_bytes = + ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes; + uint64_t compression_max_dict_buffer_bytes = + ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes; + bool compression_use_zstd_finalize_dict = + !ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer; + + int64_t tmp_val; + + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--env_uri=", 10) == 0) { + env_uri = argv[i] + 10; + } else if (strncmp(argv[i], "--fs_uri=", 9) == 0) { + fs_uri = argv[i] + 9; + } else if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--decode_blob_index") == 0) { + decode_blob_index = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], "--read_num=%lu%c", (unsigned long*)&n, &junk) == + 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 5) == 0) { + to_key = argv[i] + 5; + has_to = true; + } else if (strncmp(argv[i], "--prefix=", 9) == 0) { + from_key = argv[i] + 9; + use_from_as_prefix = true; + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else if (strcmp(argv[i], "--show_summary") == 0) { + show_summary = true; + } else if (ParseIntArg(argv[i], "--set_block_size=", + "block size must be numeric", &tmp_val)) { + set_block_size = true; + block_size = static_cast<size_t>(tmp_val); + } else if 
(ParseIntArg(argv[i], "--readahead_size=", + "readahead_size must be numeric", &tmp_val)) { + readahead_size = static_cast<size_t>(tmp_val); + } else if (strncmp(argv[i], "--compression_types=", 20) == 0) { + std::string compression_types_csv = argv[i] + 20; + std::istringstream iss(compression_types_csv); + std::string compression_type; + has_specified_compression_types = true; + while (std::getline(iss, compression_type, ',')) { + auto iter = std::find_if( + kCompressions.begin(), kCompressions.end(), + [&compression_type](std::pair<CompressionType, const char*> curr) { + return curr.second == compression_type; + }); + if (iter == kCompressions.end()) { + fprintf(stderr, "%s is not a valid CompressionType\n", + compression_type.c_str()); + exit(1); + } + compression_types.emplace_back(*iter); + } + } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) { + std::string in_key(argv[i] + 21); + try { + in_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(in_key); + } catch (...) { + std::cerr << "ERROR: Invalid key input '" << in_key + << "' Use 0x{hex representation of internal rocksdb key}" + << std::endl; + return -1; + } + Slice sl_key = ROCKSDB_NAMESPACE::Slice(in_key); + ParsedInternalKey ikey; + int retc = 0; + Status pik_status = + ParseInternalKey(sl_key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; + retc = -1; + } + fprintf(stdout, "key=%s\n", ikey.DebugString(true, true).c_str()); + return retc; + } else if (ParseIntArg(argv[i], "--compression_level_from=", + "compression_level_from must be numeric", + &tmp_val)) { + has_compression_level_from = true; + compress_level_from = static_cast<int>(tmp_val); + } else if (ParseIntArg(argv[i], "--compression_level_to=", + "compression_level_to must be numeric", &tmp_val)) { + has_compression_level_to = true; + compress_level_to = static_cast<int>(tmp_val); + } else if (ParseIntArg(argv[i], "--compression_max_dict_bytes=", + 
"compression_max_dict_bytes must be numeric", + &tmp_val)) { + if (tmp_val < 0 || tmp_val > std::numeric_limits<uint32_t>::max()) { + fprintf(stderr, "compression_max_dict_bytes must be a uint32_t: '%s'\n", + argv[i]); + print_help(/*to_stderr*/ true); + return 1; + } + compression_max_dict_bytes = static_cast<uint32_t>(tmp_val); + } else if (ParseIntArg(argv[i], "--compression_zstd_max_train_bytes=", + "compression_zstd_max_train_bytes must be numeric", + &tmp_val)) { + if (tmp_val < 0 || tmp_val > std::numeric_limits<uint32_t>::max()) { + fprintf(stderr, + "compression_zstd_max_train_bytes must be a uint32_t: '%s'\n", + argv[i]); + print_help(/*to_stderr*/ true); + return 1; + } + compression_zstd_max_train_bytes = static_cast<uint32_t>(tmp_val); + } else if (ParseIntArg(argv[i], "--compression_max_dict_buffer_bytes=", + "compression_max_dict_buffer_bytes must be numeric", + &tmp_val)) { + if (tmp_val < 0) { + fprintf(stderr, + "compression_max_dict_buffer_bytes must be positive: '%s'\n", + argv[i]); + print_help(/*to_stderr*/ true); + return 1; + } + compression_max_dict_buffer_bytes = static_cast<uint64_t>(tmp_val); + } else if (strcmp(argv[i], "--compression_use_zstd_finalize_dict") == 0) { + compression_use_zstd_finalize_dict = true; + } else if (strcmp(argv[i], "--help") == 0) { + print_help(/*to_stderr*/ false); + return 0; + } else if (strcmp(argv[i], "--version") == 0) { + printf("%s\n", GetRocksBuildInfoAsString("sst_dump").c_str()); + return 0; + } else { + fprintf(stderr, "Unrecognized argument '%s'\n\n", argv[i]); + print_help(/*to_stderr*/ true); + return 1; + } + } + + if (has_compression_level_from && has_compression_level_to) { + if (!has_specified_compression_types || compression_types.size() != 1) { + fprintf(stderr, "Specify one compression type.\n\n"); + exit(1); + } + } else if (has_compression_level_from || has_compression_level_to) { + fprintf(stderr, + "Specify both --compression_level_from and " + "--compression_level_to.\n\n"); + 
exit(1); + } + + if (use_from_as_prefix && has_from) { + fprintf(stderr, "Cannot specify --prefix and --from\n\n"); + exit(1); + } + + if (input_key_hex) { + if (has_from || use_from_as_prefix) { + from_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(from_key); + } + if (has_to) { + to_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(to_key); + } + } + + if (dir_or_file == nullptr) { + fprintf(stderr, "file or directory must be specified.\n\n"); + print_help(/*to_stderr*/ true); + exit(1); + } + + std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard; + + // If caller of SSTDumpTool::Run(...) does not specify a different env other + // than Env::Default(), then try to load custom env based on env_uri/fs_uri. + // Otherwise, the caller is responsible for creating custom env. + { + ConfigOptions config_options; + config_options.env = options.env; + Status s = Env::CreateFromUri(config_options, env_uri, fs_uri, &options.env, + &env_guard); + if (!s.ok()) { + fprintf(stderr, "CreateEnvFromUri: %s\n", s.ToString().c_str()); + exit(1); + } else { + fprintf(stdout, "options.env is %p\n", options.env); + } + } + + std::vector<std::string> filenames; + ROCKSDB_NAMESPACE::Env* env = options.env; + ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok() || filenames.empty()) { + // dir_or_file does not exist or does not contain children + // Check its existence first + Status s = env->FileExists(dir_or_file); + // dir_or_file does not exist + if (!s.ok()) { + fprintf(stderr, "%s%s: No such file or directory\n", s.ToString().c_str(), + dir_or_file); + return 1; + } + // dir_or_file exists and is treated as a "file" + // since it has no children + // This is ok since later it will be checked + // that whether it is a valid sst or not + // (A directory "file" is not a valid sst) + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + uint64_t total_read = 0; + // List of RocksDB SST file without corruption 
+ std::vector<std::string> valid_sst_files; + for (size_t i = 0; i < filenames.size(); i++) { + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + // ignore + continue; + } + + if (dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + + ROCKSDB_NAMESPACE::SstFileDumper dumper( + options, filename, Temperature::kUnknown, readahead_size, + verify_checksum, output_hex, decode_blob_index); + // Not a valid SST + if (!dumper.getStatus().ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + dumper.getStatus().ToString().c_str()); + continue; + } else { + valid_sst_files.push_back(filename); + // Print out from and to key information once + // where there is at least one valid SST + if (valid_sst_files.size() == 1) { + // from_key and to_key are only used for "check", "scan", or "" + if (command == "check" || command == "scan" || command == "") { + fprintf(stdout, "from [%s] to [%s]\n", + ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(), + ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str()); + } + } + } + + if (command == "recompress") { + st = dumper.ShowAllCompressionSizes( + set_block_size ? block_size : 16384, + compression_types.empty() ? 
kCompressions : compression_types, + compress_level_from, compress_level_to, compression_max_dict_bytes, + compression_zstd_max_train_bytes, compression_max_dict_buffer_bytes, + !compression_use_zstd_finalize_dict); + if (!st.ok()) { + fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str()); + exit(1); + } + return 0; + } + + if (command == "raw") { + std::string out_filename = filename.substr(0, filename.length() - 4); + out_filename.append("_dump.txt"); + + st = dumper.DumpTable(out_filename); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + exit(1); + } else { + fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]); + } + continue; + } + + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = dumper.ReadSequential( + command == "scan", read_num > 0 ? (read_num - total_read) : read_num, + has_from || use_from_as_prefix, from_key, has_to, to_key, + use_from_as_prefix); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } + total_read += dumper.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + + if (command == "verify") { + st = dumper.VerifyChecksum(); + if (!st.ok()) { + fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(), + st.ToString().c_str()); + } else { + fprintf(stdout, "The file is ok\n"); + } + continue; + } + + if (show_properties || show_summary) { + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + + std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties> + table_properties_from_reader; + st = dumper.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + fprintf(stderr, "Try to use initial table properties\n"); + table_properties = dumper.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if 
(table_properties != nullptr) { + if (show_properties) { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", + table_properties->ToString("\n ", ": ").c_str()); + } + total_num_files += 1; + total_num_data_blocks += table_properties->num_data_blocks; + total_data_block_size += table_properties->data_size; + total_index_block_size += table_properties->index_size; + total_filter_block_size += table_properties->filter_size; + if (show_properties) { + fprintf(stdout, + "Raw user collected properties\n" + "------------------------------\n"); + for (const auto& kv : table_properties->user_collected_properties) { + std::string prop_name = kv.first; + std::string prop_val = Slice(kv.second).ToString(true); + fprintf(stdout, " # %s: 0x%s\n", prop_name.c_str(), + prop_val.c_str()); + } + } + } else { + fprintf(stderr, "Reader unexpectedly returned null properties\n"); + } + } + } + if (show_summary) { + fprintf(stdout, "total number of files: %" PRIu64 "\n", total_num_files); + fprintf(stdout, "total number of data blocks: %" PRIu64 "\n", + total_num_data_blocks); + fprintf(stdout, "total data block size: %" PRIu64 "\n", + total_data_block_size); + fprintf(stdout, "total index block size: %" PRIu64 "\n", + total_index_block_size); + fprintf(stdout, "total filter block size: %" PRIu64 "\n", + total_filter_block_size); + } + + if (valid_sst_files.empty()) { + // No valid SST files are found + // Exit with an error state + if (dir) { + fprintf(stdout, "------------------------------\n"); + fprintf(stderr, "No valid SST files found in %s\n", dir_or_file); + } else { + fprintf(stderr, "%s is not a valid SST file\n", dir_or_file); + } + return 1; + } else { + if (command == "identify") { + if (dir) { + fprintf(stdout, "------------------------------\n"); + fprintf(stdout, "List of valid SST files found in %s:\n", dir_or_file); + for (const auto& f : valid_sst_files) { + fprintf(stdout, "%s\n", f.c_str()); + } + fprintf(stdout, "Number of valid 
SST files: %zu\n", + valid_sst_files.size()); + } else { + fprintf(stdout, "%s is a valid SST file\n", dir_or_file); + } + } + // At least one valid SST + // exit with a success state + return 0; + } +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/trace_analyzer.cc b/src/rocksdb/tools/trace_analyzer.cc new file mode 100644 index 000000000..958078d1c --- /dev/null +++ b/src/rocksdb/tools/trace_analyzer.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else +#include "tools/trace_analyzer_tool.h" +int main(int argc, char** argv) { + return ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv); +} +#endif +#else +#include <stdio.h> +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/trace_analyzer_test.cc b/src/rocksdb/tools/trace_analyzer_test.cc new file mode 100644 index 000000000..d7f9e4da8 --- /dev/null +++ b/src/rocksdb/tools/trace_analyzer_test.cc @@ -0,0 +1,890 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run trace_analyzer test\n"); + return 0; +} +#else + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <sstream> +#include <thread> + +#include "db/db_test_util.h" +#include "file/line_file_reader.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "tools/trace_analyzer_tool.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +static const int kMaxArgCount = 100; +static const size_t kArgBufferSize = 100000; +} // namespace + +// Note that, the QPS part verification of the analyzing result is not robost +// enough and causes the failure in some rare cases. Disable them temporally and +// wait for future refactor. + +// The helper functions for the test +class TraceAnalyzerTest : public testing::Test { + public: + TraceAnalyzerTest() : rnd_(0xFB) { + // test_path_ = test::TmpDir() + "trace_analyzer_test"; + test_path_ = test::PerThreadDBPath("trace_analyzer_test"); + env_ = ROCKSDB_NAMESPACE::Env::Default(); + env_->CreateDir(test_path_).PermitUncheckedError(); + dbname_ = test_path_ + "/db"; + } + + ~TraceAnalyzerTest() override {} + + void GenerateTrace(std::string trace_path) { + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreatePutOperator(); + Slice upper_bound("a"); + Slice lower_bound("abce"); + ReadOptions ro; + ro.iterate_upper_bound = &upper_bound; + ro.iterate_lower_bound = &lower_bound; + WriteOptions wo; + TraceOptions trace_opt; + DB* db_ = nullptr; + std::string value; + std::unique_ptr<TraceWriter> trace_writer; + Iterator* single_iter = nullptr; + + ASSERT_OK( + NewFileTraceWriter(env_, env_options_, trace_path, &trace_writer)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + 
ASSERT_OK(db_->StartTrace(trace_opt, std::move(trace_writer))); + + WriteBatch batch; + ASSERT_OK(batch.Put("a", "aaaaaaaaa")); + ASSERT_OK(batch.Merge("b", "aaaaaaaaaaaaaaaaaaaa")); + ASSERT_OK(batch.Delete("c")); + ASSERT_OK(batch.SingleDelete("d")); + ASSERT_OK(batch.DeleteRange("e", "f")); + ASSERT_OK(db_->Write(wo, &batch)); + std::vector<Slice> keys; + keys.push_back("a"); + keys.push_back("b"); + keys.push_back("df"); + keys.push_back("gege"); + keys.push_back("hjhjhj"); + std::vector<std::string> values; + std::vector<Status> ss = db_->MultiGet(ro, keys, &values); + ASSERT_GE(ss.size(), 0); + ASSERT_OK(ss[0]); + ASSERT_NOK(ss[2]); + std::vector<ColumnFamilyHandle*> cfs(2, db_->DefaultColumnFamily()); + std::vector<PinnableSlice> values2(keys.size()); + db_->MultiGet(ro, 2, cfs.data(), keys.data(), values2.data(), ss.data(), + false); + ASSERT_OK(ss[0]); + db_->MultiGet(ro, db_->DefaultColumnFamily(), 2, keys.data() + 3, + values2.data(), ss.data(), false); + ASSERT_OK(db_->Get(ro, "a", &value)); + + single_iter = db_->NewIterator(ro); + single_iter->Seek("a"); + ASSERT_OK(single_iter->status()); + single_iter->SeekForPrev("b"); + ASSERT_OK(single_iter->status()); + delete single_iter; + std::this_thread::sleep_for(std::chrono::seconds(1)); + + db_->Get(ro, "g", &value).PermitUncheckedError(); + + ASSERT_OK(db_->EndTrace()); + + ASSERT_OK(env_->FileExists(trace_path)); + + std::unique_ptr<WritableFile> whole_f; + std::string whole_path = test_path_ + "/0.txt"; + ASSERT_OK(env_->NewWritableFile(whole_path, &whole_f, env_options_)); + std::string whole_str = "0x61\n0x62\n0x63\n0x64\n0x65\n0x66\n"; + ASSERT_OK(whole_f->Append(whole_str)); + delete db_; + ASSERT_OK(DestroyDB(dbname_, options)); + } + + void RunTraceAnalyzer(const std::vector<std::string>& args) { + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + + for (const auto& arg : args) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc 
+ 1, kMaxArgCount); + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast<int>(arg.size()) + 1; + } + + ASSERT_EQ(0, ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv)); + } + + void CheckFileContent(const std::vector<std::string>& cnt, + std::string file_path, bool full_content) { + const auto& fs = env_->GetFileSystem(); + FileOptions fopts(env_options_); + + ASSERT_OK(fs->FileExists(file_path, fopts.io_options, nullptr)); + std::unique_ptr<FSSequentialFile> file; + ASSERT_OK(fs->NewSequentialFile(file_path, fopts, &file, nullptr)); + + LineFileReader lf_reader(std::move(file), file_path, + 4096 /* filereadahead_size */); + + std::vector<std::string> result; + std::string line; + while ( + lf_reader.ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) { + result.push_back(line); + } + + ASSERT_OK(lf_reader.GetStatus()); + + size_t min_size = std::min(cnt.size(), result.size()); + for (size_t i = 0; i < min_size; i++) { + if (full_content) { + ASSERT_EQ(result[i], cnt[i]); + } else { + ASSERT_EQ(result[i][0], cnt[i][0]); + } + } + + return; + } + + void AnalyzeTrace(std::vector<std::string>& paras_diff, + std::string output_path, std::string trace_path) { + std::vector<std::string> paras = {"./trace_analyzer", + "-convert_to_human_readable_trace", + "-output_key_stats", + "-output_access_count_stats", + "-output_prefix=test", + "-output_prefix_cut=1", + "-output_time_series", + "-output_value_distribution", + "-output_qps_stats", + "-no_key", + "-no_print"}; + for (auto& para : paras_diff) { + paras.push_back(para); + } + Status s = env_->FileExists(trace_path); + if (!s.ok()) { + GenerateTrace(trace_path); + } + ASSERT_OK(env_->CreateDir(output_path)); + RunTraceAnalyzer(paras); + } + + ROCKSDB_NAMESPACE::Env* env_; + EnvOptions env_options_; + std::string test_path_; + std::string dbname_; + Random rnd_; +}; + +TEST_F(TraceAnalyzerTest, Get) { + std::string trace_path = 
test_path_ + "/trace"; + std::string output_path = test_path_ + "/get"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=true", "-analyze_put=false", + "-analyze_delete=false", "-analyze_single_delete=false", + "-analyze_range_delete=false", "-analyze_iterator=false", + "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 1 1.000000", "0 10 1 1 1.000000"}; + file_path = output_path + "/test-get-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 2"}; + file_path = output_path + "/test-get-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30", + "1 1 1 1.000000 1.000000 0x61"}; + file_path = output_path + "/test-get-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"0 1533000630 0", "0 1533000630 1"}; + file_path = output_path + "/test-get-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"0 1"}; + file_path = output_path + "/test-get-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + 
std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-get-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"1 0 0 0 0 0 0 0 0 1"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of get + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-get-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x61 Access count: 1"}; + file_path = output_path + "/test-get-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +// Test analyzing of Put +TEST_F(TraceAnalyzerTest, Put) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/put"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=true", + "-analyze_delete=false", "-analyze_single_delete=false", + "-analyze_range_delete=false", "-analyze_iterator=false", + "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 9 0 1 1.000000"}; + file_path = output_path + "/test-put-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1"}; + file_path = output_path + "/test-put-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", 
"2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = output_path + "/test-put-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"1 1533056278 0"}; + file_path = output_path + "/test-put-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"0 1"}; + file_path = output_path + "/test-put-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-put-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + // Check the overall qps + std::vector<std::string> all_qps = {"0 1 0 0 0 0 0 0 0 1"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + /* + // Check the qps of Put + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-put-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x61 Access count: 1"}; + file_path = output_path + "/test-put-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + + // Check the value size distribution + std::vector<std::string> value_dist = { + "Number_of_value_size_between 0 and 16 is: 1"}; + file_path = output_path + "/test-put-0-accessed_value_size_distribution.txt"; + CheckFileContent(value_dist, file_path, true); + */ +} + +// 
Test analyzing of delete +TEST_F(TraceAnalyzerTest, Delete) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/delete"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=true", "-analyze_single_delete=false", + "-analyze_range_delete=false", "-analyze_iterator=false", + "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 1 1.000000"}; + file_path = output_path + "/test-delete-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1"}; + file_path = + output_path + "/test-delete-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = output_path + "/test-delete-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"2 1533000630 0"}; + file_path = output_path + "/test-delete-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"2 1"}; + file_path = output_path + "/test-delete-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole 
key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-delete-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"0 0 1 0 0 0 0 0 0 1"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of Delete + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-delete-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x63 Access count: 1"}; + file_path = output_path + "/test-delete-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +// Test analyzing of Merge +TEST_F(TraceAnalyzerTest, Merge) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/merge"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=false", "-analyze_merge=true", + "-analyze_single_delete=false", "-analyze_range_delete=false", + "-analyze_iterator=false", "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 20 0 1 1.000000"}; + file_path = output_path + "/test-merge-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1"}; + file_path = output_path + "/test-merge-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the 
trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = output_path + "/test-merge-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"5 1533000630 0"}; + file_path = output_path + "/test-merge-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"1 1"}; + file_path = output_path + "/test-merge-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-merge-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"0 0 0 0 0 1 0 0 0 1"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of Merge + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-merge-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x62 Access count: 1"}; + file_path = output_path + "/test-merge-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ + + // Check the value size distribution + std::vector<std::string> value_dist = { + "Number_of_value_size_between 0 and 24 is: 1"}; + file_path = + output_path + 
"/test-merge-0-accessed_value_size_distribution.txt"; + CheckFileContent(value_dist, file_path, true); +} + +// Test analyzing of SingleDelete +TEST_F(TraceAnalyzerTest, SingleDelete) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/single_delete"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=false", "-analyze_merge=false", + "-analyze_single_delete=true", "-analyze_range_delete=false", + "-analyze_iterator=false", "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 1 1.000000"}; + file_path = output_path + "/test-single_delete-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1"}; + file_path = + output_path + "/test-single_delete-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = output_path + "/test-single_delete-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"3 1533000630 0"}; + file_path = output_path + "/test-single_delete-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + 
std::vector<std::string> k_whole_access = {"3 1"}; + file_path = output_path + "/test-single_delete-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-single_delete-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"0 0 0 1 0 0 0 0 0 1"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of SingleDelete + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-single_delete-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x64 Access count: 1"}; + file_path = + output_path + "/test-single_delete-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +// Test analyzing of delete +TEST_F(TraceAnalyzerTest, DeleteRange) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/range_delete"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=false", "-analyze_merge=false", + "-analyze_single_delete=false", "-analyze_range_delete=true", + "-analyze_iterator=false", "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 1 1.000000", "0 10 1 1 1.000000"}; + file_path = output_path + "/test-range_delete-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, 
file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 2"}; + file_path = + output_path + "/test-range_delete-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30", + "1 1 1 1.000000 1.000000 0x65"}; + file_path = output_path + "/test-range_delete-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"4 1533000630 0", "4 1533060100 1"}; + file_path = output_path + "/test-range_delete-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"4 1", "5 1"}; + file_path = output_path + "/test-range_delete-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-range_delete-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"0 0 0 0 2 0 0 0 0 2"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of DeleteRange + std::vector<std::string> get_qps = {"2"}; + file_path = output_path + "/test-range_delete-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 
0 with QPS: 2", + "The prefix: 0x65 Access count: 1", + "The prefix: 0x66 Access count: 1"}; + file_path = + output_path + "/test-range_delete-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +// Test analyzing of Iterator +TEST_F(TraceAnalyzerTest, Iterator) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/iterator"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=false", "-analyze_merge=false", + "-analyze_single_delete=false", "-analyze_range_delete=false", + "-analyze_iterator=true", "-analyze_multiget=false"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // Check the output of Seek + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 1 1.000000"}; + file_path = output_path + "/test-iterator_Seek-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1"}; + file_path = + output_path + "/test-iterator_Seek-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = output_path + "/test-iterator_Seek-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"6 1 0"}; + file_path = output_path + 
"/test-iterator_Seek-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"0 1"}; + file_path = output_path + "/test-iterator_Seek-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-iterator_Seek-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps + std::vector<std::string> all_qps = {"0 0 0 0 0 0 1 1 0 2"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of Iterator_Seek + std::vector<std::string> get_qps = {"1"}; + file_path = output_path + "/test-iterator_Seek-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = {"At time: 0 with QPS: 1", + "The prefix: 0x61 Access count: 1"}; + file_path = + output_path + "/test-iterator_Seek-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ + + // Check the output of SeekForPrev + // check the key_stats file + k_stats = {"0 10 0 1 1.000000"}; + file_path = + output_path + "/test-iterator_SeekForPrev-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the access count distribution + k_dist = {"access_count: 1 num: 1"}; + file_path = + output_path + + "/test-iterator_SeekForPrev-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the prefix + k_prefix = {"0 0 0 0.000000 0.000000 0x30"}; + file_path = + output_path + "/test-iterator_SeekForPrev-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + k_series = {"7 0 0"}; + file_path = 
output_path + "/test-iterator_SeekForPrev-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + k_whole_access = {"1 1"}; + file_path = output_path + "/test-iterator_SeekForPrev-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"}; + file_path = + output_path + "/test-iterator_SeekForPrev-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the qps of Iterator_SeekForPrev + get_qps = {"1"}; + file_path = output_path + "/test-iterator_SeekForPrev-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + top_qps = {"At time: 0 with QPS: 1", "The prefix: 0x62 Access count: 1"}; + file_path = output_path + + "/test-iterator_SeekForPrev-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +// Test analyzing of multiget +TEST_F(TraceAnalyzerTest, MultiGet) { + std::string trace_path = test_path_ + "/trace"; + std::string output_path = test_path_ + "/multiget"; + std::string file_path; + std::vector<std::string> paras = { + "-analyze_get=false", "-analyze_put=false", + "-analyze_delete=false", "-analyze_merge=false", + "-analyze_single_delete=false", "-analyze_range_delete=true", + "-analyze_iterator=false", "-analyze_multiget=true"}; + paras.push_back("-output_dir=" + output_path); + paras.push_back("-trace_path=" + trace_path); + paras.push_back("-key_space_dir=" + test_path_); + AnalyzeTrace(paras, output_path, trace_path); + + // check the key_stats file + std::vector<std::string> k_stats = {"0 10 0 2 1.000000", "0 10 1 2 1.000000", + "0 10 2 1 1.000000", "0 10 3 2 1.000000", + "0 10 4 2 1.000000"}; + file_path = output_path + "/test-multiget-0-accessed_key_stats.txt"; + CheckFileContent(k_stats, file_path, true); + + // Check the 
access count distribution + std::vector<std::string> k_dist = {"access_count: 1 num: 1", + "access_count: 2 num: 4"}; + file_path = + output_path + "/test-multiget-0-accessed_key_count_distribution.txt"; + CheckFileContent(k_dist, file_path, true); + + // Check the trace sequence + std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8", + "8", "8", "8", "8", "8", "8", + "8", "8", "0", "6", "7", "0"}; + file_path = output_path + "/test-human_readable_trace.txt"; + CheckFileContent(k_sequence, file_path, false); + + // Check the prefix + std::vector<std::string> k_prefix = { + "0 0 0 0.000000 0.000000 0x30", "1 2 1 2.000000 1.000000 0x61", + "2 2 1 2.000000 1.000000 0x62", "3 1 1 1.000000 1.000000 0x64", + "4 2 1 2.000000 1.000000 0x67"}; + file_path = output_path + "/test-multiget-0-accessed_key_prefix_cut.txt"; + CheckFileContent(k_prefix, file_path, true); + + // Check the time series + std::vector<std::string> k_series = {"8 0 0", "8 0 1", "8 0 2", + "8 0 3", "8 0 4", "8 0 0", + "8 0 1", "8 0 3", "8 0 4"}; + file_path = output_path + "/test-multiget-0-time_series.txt"; + CheckFileContent(k_series, file_path, false); + + // Check the accessed key in whole key space + std::vector<std::string> k_whole_access = {"0 2", "1 2"}; + file_path = output_path + "/test-multiget-0-whole_key_stats.txt"; + CheckFileContent(k_whole_access, file_path, true); + + // Check the whole key prefix cut + std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", + "3 0x64", "4 0x65", "5 0x66"}; + file_path = output_path + "/test-multiget-0-whole_key_prefix_cut.txt"; + CheckFileContent(k_whole_prefix, file_path, true); + + /* + // Check the overall qps. 
We have 3 MultiGet queries and it requested 9 keys + // in total + std::vector<std::string> all_qps = {"0 0 0 0 2 0 0 0 9 11"}; + file_path = output_path + "/test-qps_stats.txt"; + CheckFileContent(all_qps, file_path, true); + + // Check the qps of DeleteRange + std::vector<std::string> get_qps = {"9"}; + file_path = output_path + "/test-multiget-0-qps_stats.txt"; + CheckFileContent(get_qps, file_path, true); + + // Check the top k qps prefix cut + std::vector<std::string> top_qps = { + "At time: 0 with QPS: 9", "The prefix: 0x61 Access count: 2", + "The prefix: 0x62 Access count: 2", "The prefix: 0x64 Access count: 1", + "The prefix: 0x67 Access count: 2", "The prefix: 0x68 Access count: 2"}; + file_path = + output_path + "/test-multiget-0-accessed_top_k_qps_prefix_cut.txt"; + CheckFileContent(top_qps, file_path, true); + */ +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +#endif // GFLAG +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Trace_analyzer test is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE return RUN_ALL_TESTS(); diff --git a/src/rocksdb/tools/trace_analyzer_tool.cc b/src/rocksdb/tools/trace_analyzer_tool.cc new file mode 100644 index 000000000..5a6d67864 --- /dev/null +++ b/src/rocksdb/tools/trace_analyzer_tool.cc @@ -0,0 +1,1925 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#ifndef ROCKSDB_LITE + +#ifdef GFLAGS +#ifdef NUMA +#include <numa.h> +#endif +#ifndef OS_WIN +#include <unistd.h> +#endif +#include <cinttypes> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <memory> +#include <sstream> +#include <stdexcept> + +#include "db/db_impl/db_impl.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "env/composite_env_wrapper.h" +#include "file/line_file_reader.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "rocksdb/write_batch.h" +#include "table/meta_blocks.h" +#include "table/table_reader.h" +#include "tools/trace_analyzer_tool.h" +#include "trace_replay/trace_replay.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/gflags_compat.h" +#include "util/random.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(trace_path, "", "The trace file path."); +DEFINE_string(output_dir, "", "The directory to store the output files."); +DEFINE_string(output_prefix, "trace", + "The prefix used for all the output files."); +DEFINE_bool(output_key_stats, false, + "Output the key access count statistics to file\n" + "for accessed keys:\n" + "file name: <prefix>-<query_type>-<cf_id>-accessed_key_stats.txt\n" + "Format:[cf_id value_size access_keyid access_count]\n" + "for the whole key space keys:\n" + "File name: <prefix>-<query_type>-<cf_id>-whole_key_stats.txt\n" + "Format:[whole_key_space_keyid access_count]"); +DEFINE_bool(output_access_count_stats, false, + "Output the access count distribution statistics to file.\n" + "File name: <prefix>-<query_type>-<cf_id>-accessed_" + "key_count_distribution.txt \n" + "Format:[access_count 
number_of_access_count]"); +DEFINE_bool(output_time_series, false, + "Output the access time in second of each key, " + "such that we can have the time series data of the queries \n" + "File name: <prefix>-<query_type>-<cf_id>-time_series.txt\n" + "Format:[type_id time_in_sec access_keyid]."); +DEFINE_bool(try_process_corrupted_trace, false, + "In default, trace_analyzer will exit if the trace file is " + "corrupted due to the unexpected tracing cases. If this option " + "is enabled, trace_analyzer will stop reading the trace file, " + "and start analyzing the read-in data."); +DEFINE_int32(output_prefix_cut, 0, + "The number of bytes as prefix to cut the keys.\n" + "If it is enabled, it will generate the following:\n" + "For accessed keys:\n" + "File name: <prefix>-<query_type>-<cf_id>-" + "accessed_key_prefix_cut.txt \n" + "Format:[acessed_keyid access_count_of_prefix " + "number_of_keys_in_prefix average_key_access " + "prefix_succ_ratio prefix]\n" + "For whole key space keys:\n" + "File name: <prefix>-<query_type>-<cf_id>" + "-whole_key_prefix_cut.txt\n" + "Format:[start_keyid_in_whole_keyspace prefix]\n" + "if 'output_qps_stats' and 'top_k' are enabled, it will output:\n" + "File name: <prefix>-<query_type>-<cf_id>" + "-accessed_top_k_qps_prefix_cut.txt\n" + "Format:[the_top_ith_qps_time QPS], [prefix qps_of_this_second]."); +DEFINE_bool(convert_to_human_readable_trace, false, + "Convert the binary trace file to a human readable txt file " + "for further processing. " + "This file will be extremely large " + "(similar size as the original binary trace file). " + "You can specify 'no_key' to reduce the size, if key is not " + "needed in the next step.\n" + "File name: <prefix>_human_readable_trace.txt\n" + "Format:[<key> type_id cf_id value_size time_in_micorsec]."); +DEFINE_bool(output_qps_stats, false, + "Output the query per second(qps) statistics \n" + "For the overall qps, it will contain all qps of each query type. 
" + "The time is started from the first trace record\n" + "File name: <prefix>_qps_stats.txt\n" + "Format: [qps_type_1 qps_type_2 ...... overall_qps]\n" + "For each cf and query, it will have its own qps output.\n" + "File name: <prefix>-<query_type>-<cf_id>_qps_stats.txt \n" + "Format:[query_count_in_this_second]."); +DEFINE_bool(no_print, false, "Do not print out any result"); +DEFINE_string( + print_correlation, "", + "intput format: [correlation pairs][.,.]\n" + "Output the query correlations between the pairs of query types " + "listed in the parameter, input should select the operations from:\n" + "get, put, delete, single_delete, rangle_delete, merge. No space " + "between the pairs separated by commar. Example: =[get,get]... " + "It will print out the number of pairs of 'A after B' and " + "the average time interval between the two query."); +DEFINE_string(key_space_dir, "", + "<the directory stores full key space files> \n" + "The key space files should be: <column family id>.txt"); +DEFINE_bool(analyze_get, false, "Analyze the Get query."); +DEFINE_bool(analyze_put, false, "Analyze the Put query."); +DEFINE_bool(analyze_delete, false, "Analyze the Delete query."); +DEFINE_bool(analyze_single_delete, false, "Analyze the SingleDelete query."); +DEFINE_bool(analyze_range_delete, false, "Analyze the DeleteRange query."); +DEFINE_bool(analyze_merge, false, "Analyze the Merge query."); +DEFINE_bool(analyze_iterator, false, + " Analyze the iterate query like Seek() and SeekForPrev()."); +DEFINE_bool(analyze_multiget, false, + " Analyze the MultiGet query. NOTE: for" + " MultiGet, we analyze each KV-pair read in one MultiGet query. " + "Therefore, the total queries and QPS are calculated based on " + "the number of KV-pairs being accessed not the number of MultiGet." 
+ "It can be improved in the future if needed"); +DEFINE_bool(no_key, false, + " Does not output the key to the result files to make smaller."); +DEFINE_bool(print_overall_stats, true, + " Print the stats of the whole trace, " + "like total requests, keys, and etc."); +DEFINE_bool(output_key_distribution, false, "Print the key size distribution."); +DEFINE_bool( + output_value_distribution, false, + "Out put the value size distribution, only available for Put and Merge.\n" + "File name: <prefix>-<query_type>-<cf_id>" + "-accessed_value_size_distribution.txt\n" + "Format:[Number_of_value_size_between x and " + "x+value_interval is: <the count>]"); +DEFINE_int32(print_top_k_access, 1, + "<top K of the variables to be printed> " + "Print the top k accessed keys, top k accessed prefix " + "and etc."); +DEFINE_int32(output_ignore_count, 0, + "<threshold>, ignores the access count <= this value, " + "it will shorter the output."); +DEFINE_int32(value_interval, 8, + "To output the value distribution, we need to set the value " + "intervals and make the statistic of the value size distribution " + "in different intervals. 
The default is 8."); +DEFINE_double(sample_ratio, 1.0, + "If the trace size is extremely huge or user want to sample " + "the trace when analyzing, sample ratio can be set (0, 1.0]"); + +namespace ROCKSDB_NAMESPACE { + +const size_t kShadowValueSize = 10; + +std::map<std::string, int> taOptToIndex = { + {"get", 0}, {"put", 1}, + {"delete", 2}, {"single_delete", 3}, + {"range_delete", 4}, {"merge", 5}, + {"iterator_Seek", 6}, {"iterator_SeekForPrev", 7}, + {"multiget", 8}}; + +std::map<int, std::string> taIndexToOpt = { + {0, "get"}, {1, "put"}, + {2, "delete"}, {3, "single_delete"}, + {4, "range_delete"}, {5, "merge"}, + {6, "iterator_Seek"}, {7, "iterator_SeekForPrev"}, + {8, "multiget"}}; + +namespace { + +uint64_t MultiplyCheckOverflow(uint64_t op1, uint64_t op2) { + if (op1 == 0 || op2 == 0) { + return 0; + } + if (std::numeric_limits<uint64_t>::max() / op1 < op2) { + return op1; + } + return (op1 * op2); +} + +} // namespace + +// The default constructor of AnalyzerOptions +AnalyzerOptions::AnalyzerOptions() + : correlation_map(kTaTypeNum, std::vector<int>(kTaTypeNum, -1)) {} + +AnalyzerOptions::~AnalyzerOptions() {} + +void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) { + std::string cur = in_str; + if (cur.size() == 0) { + return; + } + while (!cur.empty()) { + if (cur.compare(0, 1, "[") != 0) { + fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); + exit(1); + } + std::string opt1, opt2; + std::size_t split = cur.find_first_of(","); + if (split != std::string::npos) { + opt1 = cur.substr(1, split - 1); + } else { + fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); + exit(1); + } + std::size_t end = cur.find_first_of("]"); + if (end != std::string::npos) { + opt2 = cur.substr(split + 1, end - split - 1); + } else { + fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); + exit(1); + } + cur = cur.substr(end + 1); + + if (taOptToIndex.find(opt1) != taOptToIndex.end() && + 
taOptToIndex.find(opt2) != taOptToIndex.end()) { + correlation_list.push_back( + std::make_pair(taOptToIndex[opt1], taOptToIndex[opt2])); + } else { + fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str()); + exit(1); + } + } + + int sequence = 0; + for (auto& it : correlation_list) { + correlation_map[it.first][it.second] = sequence; + sequence++; + } + return; +} + +// The trace statistic struct constructor +TraceStats::TraceStats() { + cf_id = 0; + cf_name = "0"; + a_count = 0; + a_key_id = 0; + a_key_size_sqsum = 0; + a_key_size_sum = 0; + a_key_mid = 0; + a_value_size_sqsum = 0; + a_value_size_sum = 0; + a_value_mid = 0; + a_peak_qps = 0; + a_ave_qps = 0.0; +} + +TraceStats::~TraceStats() {} + +// The trace analyzer constructor +TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path, + AnalyzerOptions _analyzer_opts) + : write_batch_ts_(0), + trace_name_(trace_path), + output_path_(output_path), + analyzer_opts_(_analyzer_opts) { + ROCKSDB_NAMESPACE::EnvOptions env_options; + env_ = ROCKSDB_NAMESPACE::Env::Default(); + offset_ = 0; + total_requests_ = 0; + total_access_keys_ = 0; + total_gets_ = 0; + total_writes_ = 0; + total_seeks_ = 0; + total_seek_prevs_ = 0; + total_multigets_ = 0; + trace_create_time_ = 0; + begin_time_ = 0; + end_time_ = 0; + time_series_start_ = 0; + cur_time_sec_ = 0; + if (FLAGS_sample_ratio > 1.0 || FLAGS_sample_ratio <= 0) { + sample_max_ = 1; + } else { + sample_max_ = static_cast<uint32_t>(1.0 / FLAGS_sample_ratio); + } + + ta_.resize(kTaTypeNum); + ta_[0].type_name = "get"; + if (FLAGS_analyze_get) { + ta_[0].enabled = true; + } else { + ta_[0].enabled = false; + } + ta_[1].type_name = "put"; + if (FLAGS_analyze_put) { + ta_[1].enabled = true; + } else { + ta_[1].enabled = false; + } + ta_[2].type_name = "delete"; + if (FLAGS_analyze_delete) { + ta_[2].enabled = true; + } else { + ta_[2].enabled = false; + } + ta_[3].type_name = "single_delete"; + if (FLAGS_analyze_single_delete) { + 
ta_[3].enabled = true; + } else { + ta_[3].enabled = false; + } + ta_[4].type_name = "range_delete"; + if (FLAGS_analyze_range_delete) { + ta_[4].enabled = true; + } else { + ta_[4].enabled = false; + } + ta_[5].type_name = "merge"; + if (FLAGS_analyze_merge) { + ta_[5].enabled = true; + } else { + ta_[5].enabled = false; + } + ta_[6].type_name = "iterator_Seek"; + if (FLAGS_analyze_iterator) { + ta_[6].enabled = true; + } else { + ta_[6].enabled = false; + } + ta_[7].type_name = "iterator_SeekForPrev"; + if (FLAGS_analyze_iterator) { + ta_[7].enabled = true; + } else { + ta_[7].enabled = false; + } + ta_[8].type_name = "multiget"; + if (FLAGS_analyze_multiget) { + ta_[8].enabled = true; + } else { + ta_[8].enabled = false; + } + for (int i = 0; i < kTaTypeNum; i++) { + ta_[i].sample_count = 0; + } +} + +TraceAnalyzer::~TraceAnalyzer() {} + +// Prepare the processing +// Initiate the global trace reader and writer here +Status TraceAnalyzer::PrepareProcessing() { + Status s; + // Prepare the trace reader + if (trace_reader_ == nullptr) { + s = NewFileTraceReader(env_, env_options_, trace_name_, &trace_reader_); + } else { + s = trace_reader_->Reset(); + } + if (!s.ok()) { + return s; + } + + // Prepare and open the trace sequence file writer if needed + if (FLAGS_convert_to_human_readable_trace) { + std::string trace_sequence_name; + trace_sequence_name = + output_path_ + "/" + FLAGS_output_prefix + "-human_readable_trace.txt"; + s = env_->NewWritableFile(trace_sequence_name, &trace_sequence_f_, + env_options_); + if (!s.ok()) { + return s; + } + } + + // prepare the general QPS file writer + if (FLAGS_output_qps_stats) { + std::string qps_stats_name; + qps_stats_name = + output_path_ + "/" + FLAGS_output_prefix + "-qps_stats.txt"; + s = env_->NewWritableFile(qps_stats_name, &qps_f_, env_options_); + if (!s.ok()) { + return s; + } + + qps_stats_name = + output_path_ + "/" + FLAGS_output_prefix + "-cf_qps_stats.txt"; + s = env_->NewWritableFile(qps_stats_name, 
&cf_qps_f_, env_options_); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +Status TraceAnalyzer::ReadTraceHeader(Trace* header) { + assert(header != nullptr); + std::string encoded_trace; + // Read the trace head + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + + s = TracerHelper::DecodeTrace(encoded_trace, header); + + if (header->type != kTraceBegin) { + return Status::Corruption("Corrupted trace file. Incorrect header."); + } + if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) { + return Status::Corruption("Corrupted trace file. Incorrect magic."); + } + + return s; +} + +Status TraceAnalyzer::ReadTraceFooter(Trace* footer) { + assert(footer != nullptr); + Status s = ReadTraceRecord(footer); + if (!s.ok()) { + return s; + } + if (footer->type != kTraceEnd) { + return Status::Corruption("Corrupted trace file. Incorrect footer."); + } + return s; +} + +Status TraceAnalyzer::ReadTraceRecord(Trace* trace) { + assert(trace != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + return TracerHelper::DecodeTrace(encoded_trace, trace); +} + +// process the trace itself and redirect the trace content +// to different operation type handler. 
With different race +// format, this function can be changed +Status TraceAnalyzer::StartProcessing() { + Status s; + Trace header; + s = ReadTraceHeader(&header); + if (!s.ok()) { + fprintf(stderr, "Cannot read the header\n"); + return s; + } + // Set the default trace file version as version 0.2 + int trace_file_version = 2; + s = TracerHelper::ParseTraceHeader(header, &trace_file_version, &db_version_); + if (!s.ok()) { + return s; + } + trace_create_time_ = header.ts; + if (FLAGS_output_time_series) { + time_series_start_ = header.ts; + } + + Trace trace; + std::unique_ptr<TraceRecord> record; + while (s.ok()) { + trace.reset(); + s = ReadTraceRecord(&trace); + if (!s.ok()) { + break; + } + + end_time_ = trace.ts; + if (trace.type == kTraceEnd) { + break; + } + // Do not count TraceEnd (if there is one) + total_requests_++; + + s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version, &record); + if (s.IsNotSupported()) { + continue; + } + if (!s.ok()) { + return s; + } + s = record->Accept(this, nullptr); + if (!s.ok()) { + fprintf(stderr, "Cannot process the TraceRecord\n"); + return s; + } + } + if (s.IsIncomplete()) { + // Fix it: Reaching eof returns Incomplete status at the moment. + return Status::OK(); + } + return s; +} + +// After the trace is processed by StartProcessing, the statistic data +// is stored in the map or other in memory data structures. To get the +// other statistic result such as key size distribution, value size +// distribution, these data structures are re-processed here. 
+Status TraceAnalyzer::MakeStatistics() { + int ret; + Status s; + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled) { + continue; + } + for (auto& stat : ta_[type].stats) { + stat.second.a_key_id = 0; + for (auto& record : stat.second.a_key_stats) { + record.second.key_id = stat.second.a_key_id; + stat.second.a_key_id++; + if (record.second.access_count <= + static_cast<uint64_t>(FLAGS_output_ignore_count)) { + continue; + } + + // Generate the key access count distribution data + if (FLAGS_output_access_count_stats) { + if (stat.second.a_count_stats.find(record.second.access_count) == + stat.second.a_count_stats.end()) { + stat.second.a_count_stats[record.second.access_count] = 1; + } else { + stat.second.a_count_stats[record.second.access_count]++; + } + } + + // Generate the key size distribution data + if (FLAGS_output_key_distribution) { + if (stat.second.a_key_size_stats.find(record.first.size()) == + stat.second.a_key_size_stats.end()) { + stat.second.a_key_size_stats[record.first.size()] = 1; + } else { + stat.second.a_key_size_stats[record.first.size()]++; + } + } + + if (!FLAGS_print_correlation.empty()) { + s = MakeStatisticCorrelation(stat.second, record.second); + if (!s.ok()) { + return s; + } + } + } + + // Output the prefix cut or the whole content of the accessed key space + if (FLAGS_output_key_stats || FLAGS_output_prefix_cut > 0) { + s = MakeStatisticKeyStatsOrPrefix(stat.second); + if (!s.ok()) { + return s; + } + } + + // output the access count distribution + if (FLAGS_output_access_count_stats && stat.second.a_count_dist_f) { + for (auto& record : stat.second.a_count_stats) { + ret = snprintf(buffer_, sizeof(buffer_), + "access_count: %" PRIu64 " num: %" PRIu64 "\n", + record.first, record.second); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_count_dist_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write access count 
distribution file failed\n"); + return s; + } + } + } + + // find the medium of the key size + uint64_t k_count = 0; + bool get_mid = false; + for (auto& record : stat.second.a_key_size_stats) { + k_count += record.second; + if (!get_mid && k_count >= stat.second.a_key_mid) { + stat.second.a_key_mid = record.first; + get_mid = true; + } + if (FLAGS_output_key_distribution && stat.second.a_key_size_f) { + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n", + record.first, record.second); + if (ret < 0) { + return Status::IOError("Format output failed"); + } + std::string printout(buffer_); + s = stat.second.a_key_size_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write key size distribution file failed\n"); + return s; + } + } + } + + // output the value size distribution + uint64_t v_begin = 0, v_end = 0, v_count = 0; + get_mid = false; + for (auto& record : stat.second.a_value_size_stats) { + v_begin = v_end; + v_end = (record.first + 1) * FLAGS_value_interval; + v_count += record.second; + if (!get_mid && v_count >= stat.second.a_count / 2) { + stat.second.a_value_mid = (v_begin + v_end) / 2; + get_mid = true; + } + if (FLAGS_output_value_distribution && stat.second.a_value_size_f && + (type == TraceOperationType::kPut || + type == TraceOperationType::kMerge)) { + ret = snprintf(buffer_, sizeof(buffer_), + "Number_of_value_size_between %" PRIu64 " and %" PRIu64 + " is: %" PRIu64 "\n", + v_begin, v_end, record.second); + if (ret < 0) { + return Status::IOError("Format output failed"); + } + std::string printout(buffer_); + s = stat.second.a_value_size_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write value size distribution file failed\n"); + return s; + } + } + } + } + } + + // Make the QPS statistics + if (FLAGS_output_qps_stats) { + s = MakeStatisticQPS(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +// Process the statistics of the key access and +// prefix of the accessed keys if required 
+Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) { + int ret; + Status s; + std::string prefix = "0"; + uint64_t prefix_access = 0; + uint64_t prefix_count = 0; + uint64_t prefix_succ_access = 0; + double prefix_ave_access = 0.0; + stats.a_succ_count = 0; + for (auto& record : stats.a_key_stats) { + // write the key access statistic file + if (!stats.a_key_f) { + return Status::IOError("Failed to open accessed_key_stats file."); + } + stats.a_succ_count += record.second.succ_count; + double succ_ratio = 0.0; + if (record.second.access_count > 0) { + succ_ratio = (static_cast<double>(record.second.succ_count)) / + record.second.access_count; + } + ret = snprintf(buffer_, sizeof(buffer_), + "%u %zu %" PRIu64 " %" PRIu64 " %f\n", record.second.cf_id, + record.second.value_size, record.second.key_id, + record.second.access_count, succ_ratio); + if (ret < 0) { + return Status::IOError("Format output failed"); + } + std::string printout(buffer_); + s = stats.a_key_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write key access file failed\n"); + return s; + } + + // write the prefix cut of the accessed keys + if (FLAGS_output_prefix_cut > 0 && stats.a_prefix_cut_f) { + if (record.first.compare(0, FLAGS_output_prefix_cut, prefix) != 0) { + std::string prefix_out = + ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix); + if (prefix_count == 0) { + prefix_ave_access = 0.0; + } else { + prefix_ave_access = + (static_cast<double>(prefix_access)) / prefix_count; + } + double prefix_succ_ratio = 0.0; + if (prefix_access > 0) { + prefix_succ_ratio = + (static_cast<double>(prefix_succ_access)) / prefix_access; + } + ret = + snprintf(buffer_, sizeof(buffer_), + "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n", + record.second.key_id, prefix_access, prefix_count, + prefix_ave_access, prefix_succ_ratio, prefix_out.c_str()); + if (ret < 0) { + return Status::IOError("Format output failed"); + } + std::string pout(buffer_); + s = 
stats.a_prefix_cut_f->Append(pout); + if (!s.ok()) { + fprintf(stderr, "Write accessed key prefix file failed\n"); + return s; + } + + // make the top k statistic for the prefix + if (static_cast<int32_t>(stats.top_k_prefix_access.size()) < + FLAGS_print_top_k_access) { + stats.top_k_prefix_access.push( + std::make_pair(prefix_access, prefix_out)); + } else { + if (prefix_access > stats.top_k_prefix_access.top().first) { + stats.top_k_prefix_access.pop(); + stats.top_k_prefix_access.push( + std::make_pair(prefix_access, prefix_out)); + } + } + + if (static_cast<int32_t>(stats.top_k_prefix_ave.size()) < + FLAGS_print_top_k_access) { + stats.top_k_prefix_ave.push( + std::make_pair(prefix_ave_access, prefix_out)); + } else { + if (prefix_ave_access > stats.top_k_prefix_ave.top().first) { + stats.top_k_prefix_ave.pop(); + stats.top_k_prefix_ave.push( + std::make_pair(prefix_ave_access, prefix_out)); + } + } + + prefix = record.first.substr(0, FLAGS_output_prefix_cut); + prefix_access = 0; + prefix_count = 0; + prefix_succ_access = 0; + } + prefix_access += record.second.access_count; + prefix_count += 1; + prefix_succ_access += record.second.succ_count; + } + } + return Status::OK(); +} + +// Process the statistics of different query type +// correlations +Status TraceAnalyzer::MakeStatisticCorrelation(TraceStats& stats, + StatsUnit& unit) { + if (stats.correlation_output.size() != + analyzer_opts_.correlation_list.size()) { + return Status::Corruption("Cannot make the statistic of correlation."); + } + + for (int i = 0; i < static_cast<int>(analyzer_opts_.correlation_list.size()); + i++) { + if (i >= static_cast<int>(stats.correlation_output.size()) || + i >= static_cast<int>(unit.v_correlation.size())) { + break; + } + stats.correlation_output[i].first += unit.v_correlation[i].count; + stats.correlation_output[i].second += unit.v_correlation[i].total_ts; + } + return Status::OK(); +} + +// Process the statistics of QPS +Status TraceAnalyzer::MakeStatisticQPS() { + if 
(begin_time_ == 0) { + begin_time_ = trace_create_time_; + } + uint32_t duration = + static_cast<uint32_t>((end_time_ - begin_time_) / 1000000); + int ret; + Status s; + std::vector<std::vector<uint32_t>> type_qps( + duration, std::vector<uint32_t>(kTaTypeNum + 1, 0)); + std::vector<uint64_t> qps_sum(kTaTypeNum + 1, 0); + std::vector<uint32_t> qps_peak(kTaTypeNum + 1, 0); + qps_ave_.resize(kTaTypeNum + 1); + + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled) { + continue; + } + for (auto& stat : ta_[type].stats) { + uint32_t time_line = 0; + uint64_t cf_qps_sum = 0; + for (auto& time_it : stat.second.a_qps_stats) { + if (time_it.first >= duration) { + continue; + } + type_qps[time_it.first][kTaTypeNum] += time_it.second; + type_qps[time_it.first][type] += time_it.second; + cf_qps_sum += time_it.second; + if (time_it.second > stat.second.a_peak_qps) { + stat.second.a_peak_qps = time_it.second; + } + if (stat.second.a_qps_f) { + while (time_line < time_it.first) { + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", 0); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_qps_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write QPS file failed\n"); + return s; + } + time_line++; + } + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", time_it.second); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_qps_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write QPS file failed\n"); + return s; + } + if (time_line == time_it.first) { + time_line++; + } + } + + // Process the top k QPS peaks + if (FLAGS_output_prefix_cut > 0) { + if (static_cast<int32_t>(stat.second.top_k_qps_sec.size()) < + FLAGS_print_top_k_access) { + stat.second.top_k_qps_sec.push( + std::make_pair(time_it.second, time_it.first)); + } else { + if (stat.second.top_k_qps_sec.size() > 0 && + 
stat.second.top_k_qps_sec.top().first < time_it.second) { + stat.second.top_k_qps_sec.pop(); + stat.second.top_k_qps_sec.push( + std::make_pair(time_it.second, time_it.first)); + } + } + } + } + if (duration == 0) { + stat.second.a_ave_qps = 0; + } else { + stat.second.a_ave_qps = (static_cast<double>(cf_qps_sum)) / duration; + } + + // Output the accessed unique key number change overtime + if (stat.second.a_key_num_f) { + uint64_t cur_uni_key = + static_cast<uint64_t>(stat.second.a_key_stats.size()); + double cur_ratio = 0.0; + uint64_t cur_num = 0; + for (uint32_t i = 0; i < duration; i++) { + auto find_time = stat.second.uni_key_num.find(i); + if (find_time != stat.second.uni_key_num.end()) { + cur_ratio = (static_cast<double>(find_time->second)) / cur_uni_key; + cur_num = find_time->second; + } + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %.12f\n", + cur_num, cur_ratio); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_key_num_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, + "Write accessed unique key number change file failed\n"); + return s; + } + } + } + + // output the prefix of top k access peak + if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) { + while (!stat.second.top_k_qps_sec.empty()) { + ret = snprintf(buffer_, sizeof(buffer_), "At time: %u with QPS: %u\n", + stat.second.top_k_qps_sec.top().second, + stat.second.top_k_qps_sec.top().first); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_top_qps_prefix_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write prefix QPS top K file failed\n"); + return s; + } + uint32_t qps_time = stat.second.top_k_qps_sec.top().second; + stat.second.top_k_qps_sec.pop(); + if (stat.second.a_qps_prefix_stats.find(qps_time) != + stat.second.a_qps_prefix_stats.end()) { + for (auto& qps_prefix : 
stat.second.a_qps_prefix_stats[qps_time]) { + std::string qps_prefix_out = + ROCKSDB_NAMESPACE::LDBCommand::StringToHex(qps_prefix.first); + ret = snprintf(buffer_, sizeof(buffer_), + "The prefix: %s Access count: %u\n", + qps_prefix_out.c_str(), qps_prefix.second); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string pout(buffer_); + s = stat.second.a_top_qps_prefix_f->Append(pout); + if (!s.ok()) { + fprintf(stderr, "Write prefix QPS top K file failed\n"); + return s; + } + } + } + } + } + } + } + + if (qps_f_) { + for (uint32_t i = 0; i < duration; i++) { + for (int type = 0; type <= kTaTypeNum; type++) { + if (type < kTaTypeNum) { + ret = snprintf(buffer_, sizeof(buffer_), "%u ", type_qps[i][type]); + } else { + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", type_qps[i][type]); + } + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = qps_f_->Append(printout); + if (!s.ok()) { + return s; + } + qps_sum[type] += type_qps[i][type]; + if (type_qps[i][type] > qps_peak[type]) { + qps_peak[type] = type_qps[i][type]; + } + } + } + } + + if (cf_qps_f_) { + int cfs_size = static_cast<uint32_t>(cfs_.size()); + uint32_t v; + for (uint32_t i = 0; i < duration; i++) { + for (int cf = 0; cf < cfs_size; cf++) { + if (cfs_[cf].cf_qps.find(i) != cfs_[cf].cf_qps.end()) { + v = cfs_[cf].cf_qps[i]; + } else { + v = 0; + } + if (cf < cfs_size - 1) { + ret = snprintf(buffer_, sizeof(buffer_), "%u ", v); + } else { + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", v); + } + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = cf_qps_f_->Append(printout); + if (!s.ok()) { + return s; + } + } + } + } + + qps_peak_ = qps_peak; + for (int type = 0; type <= kTaTypeNum; type++) { + if (duration == 0) { + qps_ave_[type] = 0; + } else { + qps_ave_[type] = (static_cast<double>(qps_sum[type])) / duration; + } + } + + return Status::OK(); 
+} + +// In reprocessing, if we have the whole key space +// we can output the access count of all keys in a cf +// we can make some statistics of the whole key space +// also, we output the top k accessed keys here +Status TraceAnalyzer::ReProcessing() { + int ret; + Status s; + for (auto& cf_it : cfs_) { + uint32_t cf_id = cf_it.first; + + // output the time series; + if (FLAGS_output_time_series) { + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled || + ta_[type].stats.find(cf_id) == ta_[type].stats.end()) { + continue; + } + TraceStats& stat = ta_[type].stats[cf_id]; + if (!stat.time_series_f) { + fprintf(stderr, "Cannot write time_series of '%s' in '%u'\n", + ta_[type].type_name.c_str(), cf_id); + continue; + } + while (!stat.time_series.empty()) { + uint64_t key_id = 0; + auto found = stat.a_key_stats.find(stat.time_series.front().key); + if (found != stat.a_key_stats.end()) { + key_id = found->second.key_id; + } + ret = + snprintf(buffer_, sizeof(buffer_), "%u %" PRIu64 " %" PRIu64 "\n", + stat.time_series.front().type, + stat.time_series.front().ts, key_id); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.time_series_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write time series file failed\n"); + return s; + } + stat.time_series.pop_front(); + } + } + } + + // process the whole key space if needed + if (!FLAGS_key_space_dir.empty()) { + std::string whole_key_path = + FLAGS_key_space_dir + "/" + std::to_string(cf_id) + ".txt"; + std::string input_key, get_key; + std::vector<std::string> prefix(kTaTypeNum); + std::unique_ptr<FSSequentialFile> file; + + s = env_->GetFileSystem()->NewSequentialFile( + whole_key_path, FileOptions(env_options_), &file, nullptr); + if (!s.ok()) { + fprintf(stderr, "Cannot open the whole key space file of CF: %u\n", + cf_id); + file.reset(); + } + + if (file) { + size_t kTraceFileReadaheadSize = 2 * 1024 * 1024; + 
LineFileReader lf_reader( + std::move(file), whole_key_path, + kTraceFileReadaheadSize /* filereadahead_size */); + for (cfs_[cf_id].w_count = 0; lf_reader.ReadLine( + &get_key, Env::IO_TOTAL /* rate_limiter_priority */); + ++cfs_[cf_id].w_count) { + input_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(get_key); + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled) { + continue; + } + TraceStats& stat = ta_[type].stats[cf_id]; + if (stat.w_key_f) { + if (stat.a_key_stats.find(input_key) != stat.a_key_stats.end()) { + ret = snprintf(buffer_, sizeof(buffer_), + "%" PRIu64 " %" PRIu64 "\n", cfs_[cf_id].w_count, + stat.a_key_stats[input_key].access_count); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.w_key_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write whole key space access file failed\n"); + return s; + } + } + } + + // Output the prefix cut file of the whole key space + if (FLAGS_output_prefix_cut > 0 && stat.w_prefix_cut_f) { + if (input_key.compare(0, FLAGS_output_prefix_cut, prefix[type]) != + 0) { + prefix[type] = input_key.substr(0, FLAGS_output_prefix_cut); + std::string prefix_out = + ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix[type]); + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %s\n", + cfs_[cf_id].w_count, prefix_out.c_str()); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.w_prefix_cut_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, + "Write whole key space prefix cut file failed\n"); + return s; + } + } + } + } + + // Make the statistics fo the key size distribution + if (FLAGS_output_key_distribution) { + if (cfs_[cf_id].w_key_size_stats.find(input_key.size()) == + cfs_[cf_id].w_key_size_stats.end()) { + cfs_[cf_id].w_key_size_stats[input_key.size()] = 1; + } else { + cfs_[cf_id].w_key_size_stats[input_key.size()]++; + } + } + } + s = 
lf_reader.GetStatus(); + if (!s.ok()) { + fprintf(stderr, "Read whole key space file failed\n"); + return s; + } + } + } + + // process the top k accessed keys + if (FLAGS_print_top_k_access > 0) { + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled || + ta_[type].stats.find(cf_id) == ta_[type].stats.end()) { + continue; + } + TraceStats& stat = ta_[type].stats[cf_id]; + for (auto& record : stat.a_key_stats) { + if (static_cast<int32_t>(stat.top_k_queue.size()) < + FLAGS_print_top_k_access) { + stat.top_k_queue.push( + std::make_pair(record.second.access_count, record.first)); + } else { + if (record.second.access_count > stat.top_k_queue.top().first) { + stat.top_k_queue.pop(); + stat.top_k_queue.push( + std::make_pair(record.second.access_count, record.first)); + } + } + } + } + } + } + return Status::OK(); +} + +// End the processing, print the requested results +Status TraceAnalyzer::EndProcessing() { + Status s; + if (trace_sequence_f_) { + s = trace_sequence_f_->Close(); + } + if (FLAGS_no_print) { + return s; + } + PrintStatistics(); + if (s.ok()) { + s = CloseOutputFiles(); + } + return s; +} + +// Insert the corresponding key statistics to the correct type +// and correct CF, output the time-series file if needed +Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type, + const uint32_t& cf_id, + const std::string& key, + const size_t value_size, + const uint64_t ts) { + Status s; + StatsUnit unit; + unit.key_id = 0; + unit.cf_id = cf_id; + unit.value_size = value_size; + unit.access_count = 1; + unit.latest_ts = ts; + if ((type != TraceOperationType::kGet && + type != TraceOperationType::kMultiGet) || + value_size > 0) { + unit.succ_count = 1; + } else { + unit.succ_count = 0; + } + unit.v_correlation.resize(analyzer_opts_.correlation_list.size()); + for (int i = 0; + i < (static_cast<int>(analyzer_opts_.correlation_list.size())); i++) { + unit.v_correlation[i].count = 0; + unit.v_correlation[i].total_ts = 0; + } + std::string 
prefix; + if (FLAGS_output_prefix_cut > 0) { + prefix = key.substr(0, FLAGS_output_prefix_cut); + } + + if (begin_time_ == 0) { + begin_time_ = ts; + } + uint32_t time_in_sec; + if (ts < begin_time_) { + time_in_sec = 0; + } else { + time_in_sec = static_cast<uint32_t>((ts - begin_time_) / 1000000); + } + + uint64_t dist_value_size = value_size / FLAGS_value_interval; + auto found_stats = ta_[type].stats.find(cf_id); + if (found_stats == ta_[type].stats.end()) { + ta_[type].stats[cf_id].cf_id = cf_id; + ta_[type].stats[cf_id].cf_name = std::to_string(cf_id); + ta_[type].stats[cf_id].a_count = 1; + ta_[type].stats[cf_id].a_key_id = 0; + ta_[type].stats[cf_id].a_key_size_sqsum = MultiplyCheckOverflow( + static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size())); + ta_[type].stats[cf_id].a_key_size_sum = key.size(); + ta_[type].stats[cf_id].a_value_size_sqsum = MultiplyCheckOverflow( + static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size)); + ta_[type].stats[cf_id].a_value_size_sum = value_size; + s = OpenStatsOutputFiles(ta_[type].type_name, ta_[type].stats[cf_id]); + if (!FLAGS_print_correlation.empty()) { + s = StatsUnitCorrelationUpdate(unit, type, ts, key); + } + ta_[type].stats[cf_id].a_key_stats[key] = unit; + ta_[type].stats[cf_id].a_value_size_stats[dist_value_size] = 1; + ta_[type].stats[cf_id].a_qps_stats[time_in_sec] = 1; + ta_[type].stats[cf_id].correlation_output.resize( + analyzer_opts_.correlation_list.size()); + if (FLAGS_output_prefix_cut > 0) { + std::map<std::string, uint32_t> tmp_qps_map; + tmp_qps_map[prefix] = 1; + ta_[type].stats[cf_id].a_qps_prefix_stats[time_in_sec] = tmp_qps_map; + } + if (time_in_sec != cur_time_sec_) { + ta_[type].stats[cf_id].uni_key_num[cur_time_sec_] = + static_cast<uint64_t>(ta_[type].stats[cf_id].a_key_stats.size()); + cur_time_sec_ = time_in_sec; + } + } else { + found_stats->second.a_count++; + found_stats->second.a_key_size_sqsum += MultiplyCheckOverflow( + 
static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size())); + found_stats->second.a_key_size_sum += key.size(); + found_stats->second.a_value_size_sqsum += MultiplyCheckOverflow( + static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size)); + found_stats->second.a_value_size_sum += value_size; + auto found_key = found_stats->second.a_key_stats.find(key); + if (found_key == found_stats->second.a_key_stats.end()) { + found_stats->second.a_key_stats[key] = unit; + } else { + found_key->second.access_count++; + if (type != TraceOperationType::kGet || value_size > 0) { + found_key->second.succ_count++; + } + if (!FLAGS_print_correlation.empty()) { + s = StatsUnitCorrelationUpdate(found_key->second, type, ts, key); + } + } + if (time_in_sec != cur_time_sec_) { + found_stats->second.uni_key_num[cur_time_sec_] = + static_cast<uint64_t>(found_stats->second.a_key_stats.size()); + cur_time_sec_ = time_in_sec; + } + + auto found_value = + found_stats->second.a_value_size_stats.find(dist_value_size); + if (found_value == found_stats->second.a_value_size_stats.end()) { + found_stats->second.a_value_size_stats[dist_value_size] = 1; + } else { + found_value->second++; + } + + auto found_qps = found_stats->second.a_qps_stats.find(time_in_sec); + if (found_qps == found_stats->second.a_qps_stats.end()) { + found_stats->second.a_qps_stats[time_in_sec] = 1; + } else { + found_qps->second++; + } + + if (FLAGS_output_prefix_cut > 0) { + auto found_qps_prefix = + found_stats->second.a_qps_prefix_stats.find(time_in_sec); + if (found_qps_prefix == found_stats->second.a_qps_prefix_stats.end()) { + std::map<std::string, uint32_t> tmp_qps_map; + found_stats->second.a_qps_prefix_stats[time_in_sec] = tmp_qps_map; + } + if (found_stats->second.a_qps_prefix_stats[time_in_sec].find(prefix) == + found_stats->second.a_qps_prefix_stats[time_in_sec].end()) { + found_stats->second.a_qps_prefix_stats[time_in_sec][prefix] = 1; + } else { + 
found_stats->second.a_qps_prefix_stats[time_in_sec][prefix]++; + } + } + } + + if (cfs_.find(cf_id) == cfs_.end()) { + CfUnit cf_unit; + cf_unit.cf_id = cf_id; + cf_unit.w_count = 0; + cf_unit.a_count = 0; + cfs_[cf_id] = cf_unit; + } + + if (FLAGS_output_qps_stats) { + cfs_[cf_id].cf_qps[time_in_sec]++; + } + + if (FLAGS_output_time_series) { + TraceUnit trace_u; + trace_u.type = type; + trace_u.key = key; + trace_u.value_size = value_size; + trace_u.ts = (ts - time_series_start_) / 1000000; + trace_u.cf_id = cf_id; + ta_[type].stats[cf_id].time_series.push_back(trace_u); + } + + return s; +} + +// Update the correlation unit of each key if enabled +Status TraceAnalyzer::StatsUnitCorrelationUpdate(StatsUnit& unit, + const uint32_t& type_second, + const uint64_t& ts, + const std::string& key) { + if (type_second >= kTaTypeNum) { + fprintf(stderr, "Unknown Type Id: %u\n", type_second); + return Status::NotFound(); + } + + for (int type_first = 0; type_first < kTaTypeNum; type_first++) { + if (type_first >= static_cast<int>(ta_.size()) || + type_first >= static_cast<int>(analyzer_opts_.correlation_map.size())) { + break; + } + if (analyzer_opts_.correlation_map[type_first][type_second] < 0 || + ta_[type_first].stats.find(unit.cf_id) == ta_[type_first].stats.end() || + ta_[type_first].stats[unit.cf_id].a_key_stats.find(key) == + ta_[type_first].stats[unit.cf_id].a_key_stats.end() || + ta_[type_first].stats[unit.cf_id].a_key_stats[key].latest_ts == ts) { + continue; + } + + int correlation_id = + analyzer_opts_.correlation_map[type_first][type_second]; + + // after get the x-y operation time or x, update; + if (correlation_id < 0 || + correlation_id >= static_cast<int>(unit.v_correlation.size())) { + continue; + } + unit.v_correlation[correlation_id].count++; + unit.v_correlation[correlation_id].total_ts += + (ts - ta_[type_first].stats[unit.cf_id].a_key_stats[key].latest_ts); + } + + unit.latest_ts = ts; + return Status::OK(); +} + +// when a new trace statistic is 
created, the file handler +// pointers should be initiated if needed according to +// the trace analyzer options +Status TraceAnalyzer::OpenStatsOutputFiles(const std::string& type, + TraceStats& new_stats) { + Status s; + if (FLAGS_output_key_stats) { + s = CreateOutputFile(type, new_stats.cf_name, "accessed_key_stats.txt", + &new_stats.a_key_f); + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_unique_key_num_change.txt", + &new_stats.a_key_num_f); + if (!FLAGS_key_space_dir.empty()) { + s = CreateOutputFile(type, new_stats.cf_name, "whole_key_stats.txt", + &new_stats.w_key_f); + } + } + + if (FLAGS_output_access_count_stats) { + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_key_count_distribution.txt", + &new_stats.a_count_dist_f); + } + + if (FLAGS_output_prefix_cut > 0) { + s = CreateOutputFile(type, new_stats.cf_name, "accessed_key_prefix_cut.txt", + &new_stats.a_prefix_cut_f); + if (!FLAGS_key_space_dir.empty()) { + s = CreateOutputFile(type, new_stats.cf_name, "whole_key_prefix_cut.txt", + &new_stats.w_prefix_cut_f); + } + + if (FLAGS_output_qps_stats) { + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_top_k_qps_prefix_cut.txt", + &new_stats.a_top_qps_prefix_f); + } + } + + if (FLAGS_output_time_series) { + s = CreateOutputFile(type, new_stats.cf_name, "time_series.txt", + &new_stats.time_series_f); + } + + if (FLAGS_output_value_distribution) { + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_value_size_distribution.txt", + &new_stats.a_value_size_f); + } + + if (FLAGS_output_key_distribution) { + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_key_size_distribution.txt", + &new_stats.a_key_size_f); + } + + if (FLAGS_output_qps_stats) { + s = CreateOutputFile(type, new_stats.cf_name, "qps_stats.txt", + &new_stats.a_qps_f); + } + + return s; +} + +// create the output path of the files to be opened +Status TraceAnalyzer::CreateOutputFile( + const std::string& type, const std::string& cf_name, + const 
std::string& ending, + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr) { + std::string path; + path = output_path_ + "/" + FLAGS_output_prefix + "-" + type + "-" + cf_name + + "-" + ending; + Status s; + s = env_->NewWritableFile(path, f_ptr, env_options_); + if (!s.ok()) { + fprintf(stderr, "Cannot open file: %s\n", path.c_str()); + exit(1); + } + return Status::OK(); +} + +// Close the output files in the TraceStats if they are opened +Status TraceAnalyzer::CloseOutputFiles() { + Status s; + for (int type = 0; type < kTaTypeNum; type++) { + if (!ta_[type].enabled) { + continue; + } + for (auto& stat : ta_[type].stats) { + if (s.ok() && stat.second.time_series_f) { + s = stat.second.time_series_f->Close(); + } + + if (s.ok() && stat.second.a_key_f) { + s = stat.second.a_key_f->Close(); + } + + if (s.ok() && stat.second.a_key_num_f) { + s = stat.second.a_key_num_f->Close(); + } + + if (s.ok() && stat.second.a_count_dist_f) { + s = stat.second.a_count_dist_f->Close(); + } + + if (s.ok() && stat.second.a_prefix_cut_f) { + s = stat.second.a_prefix_cut_f->Close(); + } + + if (s.ok() && stat.second.a_value_size_f) { + s = stat.second.a_value_size_f->Close(); + } + + if (s.ok() && stat.second.a_key_size_f) { + s = stat.second.a_key_size_f->Close(); + } + + if (s.ok() && stat.second.a_qps_f) { + s = stat.second.a_qps_f->Close(); + } + + if (s.ok() && stat.second.a_top_qps_prefix_f) { + s = stat.second.a_top_qps_prefix_f->Close(); + } + + if (s.ok() && stat.second.w_key_f) { + s = stat.second.w_key_f->Close(); + } + if (s.ok() && stat.second.w_prefix_cut_f) { + s = stat.second.w_prefix_cut_f->Close(); + } + } + } + return s; +} + +Status TraceAnalyzer::Handle(const WriteQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* /*result*/) { + total_writes_++; + // Note that, if the write happens in a transaction, + // 'Write' will be called twice, one for Prepare, one for + // Commit. 
Thus, in the trace, for the same WriteBatch, there + // will be two records if it is in a transaction. Here, we only + // process the reord that is committed. If write is non-transaction, + // HasBeginPrepare()==false, so we process it normally. + WriteBatch batch(record.GetWriteBatchRep().ToString()); + if (batch.Count() == 0 || (batch.HasBeginPrepare() && !batch.HasCommit())) { + return Status::OK(); + } + write_batch_ts_ = record.GetTimestamp(); + + // write_result_ will be updated in batch's handler during iteration. + Status s = batch.Iterate(this); + write_batch_ts_ = 0; + if (!s.ok()) { + fprintf(stderr, "Cannot process the write batch in the trace\n"); + return s; + } + + return Status::OK(); +} + +Status TraceAnalyzer::Handle(const GetQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* /*result*/) { + total_gets_++; + return OutputAnalysisResult(TraceOperationType::kGet, record.GetTimestamp(), + record.GetColumnFamilyID(), + std::move(record.GetKey()), 0); +} + +Status TraceAnalyzer::Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* /*result*/) { + TraceOperationType op_type; + if (record.GetSeekType() == IteratorSeekQueryTraceRecord::kSeek) { + op_type = TraceOperationType::kIteratorSeek; + total_seeks_++; + } else { + op_type = TraceOperationType::kIteratorSeekForPrev; + total_seek_prevs_++; + } + + // To do: shall we add lower/upper bounds? + + return OutputAnalysisResult(op_type, record.GetTimestamp(), + record.GetColumnFamilyID(), + std::move(record.GetKey()), 0); +} + +Status TraceAnalyzer::Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* /*result*/) { + total_multigets_++; + + std::vector<uint32_t> cf_ids = record.GetColumnFamilyIDs(); + std::vector<Slice> keys = record.GetKeys(); + std::vector<size_t> value_sizes; + + // If the size does not match is not the error of tracing and anayzing, we + // just report it to the user. The analyzing continues. 
+ if (cf_ids.size() > keys.size()) { + printf("The CF ID vector size does not match the keys vector size!\n"); + // Make the sure the 2 vectors are of the same (smaller) size. + cf_ids.resize(keys.size()); + } else if (cf_ids.size() < keys.size()) { + printf("The CF ID vector size does not match the keys vector size!\n"); + // Make the sure the 2 vectors are of the same (smaller) size. + keys.resize(cf_ids.size()); + } + // Now the 2 vectors must be of the same size. + value_sizes.resize(cf_ids.size(), 0); + + return OutputAnalysisResult(TraceOperationType::kMultiGet, + record.GetTimestamp(), std::move(cf_ids), + std::move(keys), std::move(value_sizes)); +} + +// Handle the Put request in the write batch of the trace +Status TraceAnalyzer::PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + return OutputAnalysisResult(TraceOperationType::kPut, write_batch_ts_, + column_family_id, key, value.size()); +} + +// Handle the Delete request in the write batch of the trace +Status TraceAnalyzer::DeleteCF(uint32_t column_family_id, const Slice& key) { + return OutputAnalysisResult(TraceOperationType::kDelete, write_batch_ts_, + column_family_id, key, 0); +} + +// Handle the SingleDelete request in the write batch of the trace +Status TraceAnalyzer::SingleDeleteCF(uint32_t column_family_id, + const Slice& key) { + return OutputAnalysisResult(TraceOperationType::kSingleDelete, + write_batch_ts_, column_family_id, key, 0); +} + +// Handle the DeleteRange request in the write batch of the trace +Status TraceAnalyzer::DeleteRangeCF(uint32_t column_family_id, + const Slice& begin_key, + const Slice& end_key) { + return OutputAnalysisResult(TraceOperationType::kRangeDelete, write_batch_ts_, + {column_family_id, column_family_id}, + {begin_key, end_key}, {0, 0}); +} + +// Handle the Merge request in the write batch of the trace +Status TraceAnalyzer::MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + return 
OutputAnalysisResult(TraceOperationType::kMerge, write_batch_ts_, + column_family_id, key, value.size()); +} + +Status TraceAnalyzer::OutputAnalysisResult(TraceOperationType op_type, + uint64_t timestamp, + std::vector<uint32_t> cf_ids, + std::vector<Slice> keys, + std::vector<size_t> value_sizes) { + assert(!cf_ids.empty()); + assert(cf_ids.size() == keys.size()); + assert(cf_ids.size() == value_sizes.size()); + + Status s; + + if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) { + // DeleteRane only writes the begin_key. + size_t cnt = + op_type == TraceOperationType::kRangeDelete ? 1 : cf_ids.size(); + for (size_t i = 0; i < cnt; i++) { + s = WriteTraceSequence(op_type, cf_ids[i], keys[i], value_sizes[i], + timestamp); + if (!s.ok()) { + return Status::Corruption("Failed to write the trace sequence to file"); + } + } + } + + if (ta_[op_type].sample_count >= sample_max_) { + ta_[op_type].sample_count = 0; + } + if (ta_[op_type].sample_count > 0) { + ta_[op_type].sample_count++; + return Status::OK(); + } + ta_[op_type].sample_count++; + + if (!ta_[op_type].enabled) { + return Status::OK(); + } + + for (size_t i = 0; i < cf_ids.size(); i++) { + // Get query does not have value part, just give a fixed value 10 for easy + // calculation. + s = KeyStatsInsertion( + op_type, cf_ids[i], keys[i].ToString(), + value_sizes[i] == 0 ? kShadowValueSize : value_sizes[i], timestamp); + if (!s.ok()) { + return Status::Corruption("Failed to insert key statistics"); + } + } + + return Status::OK(); +} + +Status TraceAnalyzer::OutputAnalysisResult(TraceOperationType op_type, + uint64_t timestamp, uint32_t cf_id, + const Slice& key, + size_t value_size) { + return OutputAnalysisResult( + op_type, timestamp, std::vector<uint32_t>({cf_id}), + std::vector<Slice>({key}), std::vector<size_t>({value_size})); +} + +// Before the analyzer is closed, the requested general statistic results are +// printed out here. 
// In the current stage, this information is printed to stdout only, not to
// the output files.
// -----type
//  |__cf_id
//      |_statistics
void TraceAnalyzer::PrintStatistics() {
  for (int type = 0; type < kTaTypeNum; type++) {
    if (!ta_[type].enabled) {
      continue;
    }
    // Reset the per-type aggregates before summing over all CFs.
    ta_[type].total_keys = 0;
    ta_[type].total_access = 0;
    ta_[type].total_succ_access = 0;
    printf("\n################# Operation Type: %s #####################\n",
           ta_[type].type_name.c_str());
    if (qps_ave_.size() == kTaTypeNum + 1) {
      printf("Peak QPS is: %u Average QPS is: %f\n", qps_peak_[type],
             qps_ave_[type]);
    }
    for (auto& stat_it : ta_[type].stats) {
      if (stat_it.second.a_count == 0) {
        continue;
      }
      TraceStats& stat = stat_it.second;
      uint64_t total_a_keys = static_cast<uint64_t>(stat.a_key_stats.size());
      // Mean and standard deviation of key/value sizes, derived from the
      // running sum and sum-of-squares accumulated during processing.
      double key_size_ave = 0.0;
      double value_size_ave = 0.0;
      double key_size_vari = 0.0;
      double value_size_vari = 0.0;
      if (stat.a_count > 0) {
        key_size_ave =
            (static_cast<double>(stat.a_key_size_sum)) / stat.a_count;
        value_size_ave =
            (static_cast<double>(stat.a_value_size_sum)) / stat.a_count;
        key_size_vari = std::sqrt((static_cast<double>(stat.a_key_size_sqsum)) /
                                      stat.a_count -
                                  key_size_ave * key_size_ave);
        value_size_vari = std::sqrt(
            (static_cast<double>(stat.a_value_size_sqsum)) / stat.a_count -
            value_size_ave * value_size_ave);
      }
      if (value_size_ave == 0.0) {
        stat.a_value_mid = 0;
      }
      cfs_[stat.cf_id].a_count += total_a_keys;
      ta_[type].total_keys += total_a_keys;
      ta_[type].total_access += stat.a_count;
      ta_[type].total_succ_access += stat.a_succ_count;
      printf("*********************************************************\n");
      // NOTE(review): typo in the output string below ("colume" should be
      // "column") — left unchanged because tools may parse this output.
      printf("colume family id: %u\n", stat.cf_id);
      printf("Total number of queries to this cf by %s: %" PRIu64 "\n",
             ta_[type].type_name.c_str(), stat.a_count);
      printf("Total unique keys in this cf: %" PRIu64 "\n", total_a_keys);
      printf("Average key size: %f key size medium: %" PRIu64
             " Key size Variation: %f\n",
             key_size_ave, stat.a_key_mid, key_size_vari);
      // Value-size statistics only make sense for value-carrying operations.
      if (type == kPut || type == kMerge) {
        printf("Average value size: %f Value size medium: %" PRIu64
               " Value size variation: %f\n",
               value_size_ave, stat.a_value_mid, value_size_vari);
      }
      printf("Peak QPS is: %u Average QPS is: %f\n", stat.a_peak_qps,
             stat.a_ave_qps);

      // Print the top-k accessed keys and their access counts; the
      // priority queue is drained (destroyed) in the process.
      if (FLAGS_print_top_k_access > 0) {
        printf("The Top %d keys that are accessed:\n",
               FLAGS_print_top_k_access);
        while (!stat.top_k_queue.empty()) {
          std::string hex_key = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(
              stat.top_k_queue.top().second);
          printf("Access_count: %" PRIu64 " %s\n", stat.top_k_queue.top().first,
                 hex_key.c_str());
          stat.top_k_queue.pop();
        }
      }

      // Print the top-k accessed prefix ranges and the top-k prefix ranges
      // with the highest average access per key.
      if (FLAGS_output_prefix_cut > 0) {
        printf("The Top %d accessed prefix range:\n", FLAGS_print_top_k_access);
        while (!stat.top_k_prefix_access.empty()) {
          printf("Prefix: %s Access count: %" PRIu64 "\n",
                 stat.top_k_prefix_access.top().second.c_str(),
                 stat.top_k_prefix_access.top().first);
          stat.top_k_prefix_access.pop();
        }

        printf("The Top %d prefix with highest access per key:\n",
               FLAGS_print_top_k_access);
        while (!stat.top_k_prefix_ave.empty()) {
          printf("Prefix: %s access per key: %f\n",
                 stat.top_k_prefix_ave.top().second.c_str(),
                 stat.top_k_prefix_ave.top().first);
          stat.top_k_prefix_ave.pop();
        }
      }

      // Print the operation correlations (e.g. average time between a
      // configured pair of operations on the same key).
      if (!FLAGS_print_correlation.empty()) {
        for (int correlation = 0;
             correlation <
             static_cast<int>(analyzer_opts_.correlation_list.size());
             correlation++) {
          printf(
              "The correlation statistics of '%s' after '%s' is:",
              taIndexToOpt[analyzer_opts_.correlation_list[correlation].second]
                  .c_str(),
              taIndexToOpt[analyzer_opts_.correlation_list[correlation].first]
                  .c_str());
          double correlation_ave = 0.0;
          if (stat.correlation_output[correlation].first > 0) {
            // second holds the summed time; dividing by count*1000 converts
            // to the per-pair average reported in milliseconds below.
            correlation_ave =
                (static_cast<double>(
                    stat.correlation_output[correlation].second)) /
                (stat.correlation_output[correlation].first * 1000);
          }
          printf(" total numbers: %" PRIu64 " average time: %f(ms)\n",
                 stat.correlation_output[correlation].first, correlation_ave);
        }
      }
    }
    printf("*********************************************************\n");
    printf("Total keys of '%s' is: %" PRIu64 "\n", ta_[type].type_name.c_str(),
           ta_[type].total_keys);
    printf("Total access is: %" PRIu64 "\n", ta_[type].total_access);
    total_access_keys_ += ta_[type].total_keys;
  }

  // Print the overall statistic information of the trace.
  printf("\n*********************************************************\n");
  printf("*********************************************************\n");
  printf("The column family based statistics\n");
  for (auto& cf : cfs_) {
    printf("The column family id: %u\n", cf.first);
    printf("The whole key space key numbers: %" PRIu64 "\n", cf.second.w_count);
    printf("The accessed key space key numbers: %" PRIu64 "\n",
           cf.second.a_count);
  }

  if (FLAGS_print_overall_stats) {
    printf("\n*********************************************************\n");
    printf("*********************************************************\n");
    if (qps_peak_.size() == kTaTypeNum + 1) {
      printf("Average QPS per second: %f Peak QPS: %u\n", qps_ave_[kTaTypeNum],
             qps_peak_[kTaTypeNum]);
    }
    // Because of sampling, reported query counts must be multiplied by
    // sample_max_ to approximate the true totals.
    printf("The statistics related to query number need to times: %u\n",
           sample_max_);
    printf("Total_requests: %" PRIu64 " Total_accessed_keys: %" PRIu64
           " Total_gets: %" PRIu64 " Total_write_batches: %" PRIu64
           " Total_seeks: %" PRIu64 " Total_seek_for_prevs: %" PRIu64
           " Total_multigets: %" PRIu64 "\n",
           total_requests_, total_access_keys_, total_gets_, total_writes_,
           total_seeks_, total_seek_prevs_, total_multigets_);
    for (int type = 0; type < kTaTypeNum; type++) {
      if (!ta_[type].enabled) {
        continue;
} + printf("Operation: '%s' has: %" PRIu64 "\n", ta_[type].type_name.c_str(), + ta_[type].total_access); + } + } +} + +// Write the trace sequence to file +Status TraceAnalyzer::WriteTraceSequence(const uint32_t& type, + const uint32_t& cf_id, + const Slice& key, + const size_t value_size, + const uint64_t ts) { + std::string hex_key = + ROCKSDB_NAMESPACE::LDBCommand::StringToHex(key.ToString()); + int ret; + ret = snprintf(buffer_, sizeof(buffer_), "%u %u %zu %" PRIu64 "\n", type, + cf_id, value_size, ts); + if (ret < 0) { + return Status::IOError("failed to format the output"); + } + std::string printout(buffer_); + if (!FLAGS_no_key) { + printout = hex_key + " " + printout; + } + return trace_sequence_f_->Append(printout); +} + +// The entrance function of Trace_Analyzer +int trace_analyzer_tool(int argc, char** argv) { + std::string trace_path; + std::string output_path; + + AnalyzerOptions analyzer_opts; + + ParseCommandLineFlags(&argc, &argv, true); + + if (!FLAGS_print_correlation.empty()) { + analyzer_opts.SparseCorrelationInput(FLAGS_print_correlation); + } + + std::unique_ptr<TraceAnalyzer> analyzer( + new TraceAnalyzer(FLAGS_trace_path, FLAGS_output_dir, analyzer_opts)); + + if (!analyzer) { + fprintf(stderr, "Cannot initiate the trace analyzer\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::Status s = analyzer->PrepareProcessing(); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.getState()); + fprintf(stderr, "Cannot initiate the trace reader\n"); + exit(1); + } + + s = analyzer->StartProcessing(); + if (!s.ok() && !FLAGS_try_process_corrupted_trace) { + fprintf(stderr, "%s\n", s.getState()); + fprintf(stderr, "Cannot process the trace\n"); + exit(1); + } + + s = analyzer->MakeStatistics(); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.getState()); + analyzer->EndProcessing(); + fprintf(stderr, "Cannot make the statistics\n"); + exit(1); + } + + s = analyzer->ReProcessing(); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.getState()); + fprintf(stderr, "Cannot 
re-process the trace for more statistics\n"); + analyzer->EndProcessing(); + exit(1); + } + + s = analyzer->EndProcessing(); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.getState()); + fprintf(stderr, "Cannot close the trace analyzer\n"); + exit(1); + } + + return 0; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // Endif of Gflag +#endif // RocksDB LITE diff --git a/src/rocksdb/tools/trace_analyzer_tool.h b/src/rocksdb/tools/trace_analyzer_tool.h new file mode 100644 index 000000000..4b885b18c --- /dev/null +++ b/src/rocksdb/tools/trace_analyzer_tool.h @@ -0,0 +1,326 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <list> +#include <map> +#include <queue> +#include <set> +#include <utility> +#include <vector> + +#include "rocksdb/env.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/write_batch.h" +#include "trace_replay/trace_replay.h" + +namespace ROCKSDB_NAMESPACE { + +// Value sizes may be used as denominators. Replacing 0 value sizes with this +// positive integer avoids division error. 
extern const size_t kShadowValueSize /* = 10*/;

// Identifiers for the trace operations tracked by the analyzer. The values
// are used directly as indices into per-type arrays (e.g. the ta_ vector),
// so they must stay dense; kTaTypeNum is the number of real types.
enum TraceOperationType : int {
  kGet = 0,
  kPut = 1,
  kDelete = 2,
  kSingleDelete = 3,
  kRangeDelete = 4,
  kMerge = 5,
  kIteratorSeek = 6,
  kIteratorSeekForPrev = 7,
  kMultiGet = 8,
  kTaTypeNum = 9
};

// One decoded trace operation, as written to/read from intermediate files.
struct TraceUnit {
  uint64_t ts;        // operation timestamp
  uint32_t type;      // a TraceOperationType value
  uint32_t cf_id;     // column family the operation targets
  size_t value_size;  // value payload size (0 for value-less ops)
  std::string key;
};

// Accumulator for operation-pair correlation: how many times the pair was
// observed and the summed time between the two operations.
struct TypeCorrelation {
  uint64_t count;
  uint64_t total_ts;
};

// Per-key statistics collected while processing the trace.
struct StatsUnit {
  uint64_t key_id;
  uint64_t access_count;
  uint64_t latest_ts;
  uint64_t succ_count;  // current only used to count Get if key found
  uint32_t cf_id;
  size_t value_size;
  // One slot per configured correlation pair; see TypeCorrelation.
  std::vector<TypeCorrelation> v_correlation;
};

// Parsed analyzer configuration, chiefly the operation-correlation pairs
// requested on the command line (see SparseCorrelationInput).
class AnalyzerOptions {
 public:
  std::vector<std::vector<int>> correlation_map;
  std::vector<std::pair<int, int>> correlation_list;

  AnalyzerOptions();

  ~AnalyzerOptions();

  // Parses the comma-separated correlation-pair specification string into
  // correlation_map / correlation_list.
  void SparseCorrelationInput(const std::string& in_str);
};

// Note that, for the variable names in the trace_analyzer,
// Starting with 'a_' means the variable is used for 'accessed_keys'.
// Starting with 'w_' means it is used for 'the whole key space'.
// Ending with '_f' means a file write or reader pointer.
// For example, 'a_count' means 'accessed_keys_count',
// 'w_key_f' means 'whole_key_space_file'.
+ +struct TraceStats { + uint32_t cf_id; + std::string cf_name; + uint64_t a_count; + uint64_t a_succ_count; + uint64_t a_key_id; + uint64_t a_key_size_sqsum; + uint64_t a_key_size_sum; + uint64_t a_key_mid; + uint64_t a_value_size_sqsum; + uint64_t a_value_size_sum; + uint64_t a_value_mid; + uint32_t a_peak_qps; + double a_ave_qps; + std::map<std::string, StatsUnit> a_key_stats; + std::map<uint64_t, uint64_t> a_count_stats; + std::map<uint64_t, uint64_t> a_key_size_stats; + std::map<uint64_t, uint64_t> a_value_size_stats; + std::map<uint32_t, uint32_t> a_qps_stats; + std::map<uint32_t, std::map<std::string, uint32_t>> a_qps_prefix_stats; + std::priority_queue<std::pair<uint64_t, std::string>, + std::vector<std::pair<uint64_t, std::string>>, + std::greater<std::pair<uint64_t, std::string>>> + top_k_queue; + std::priority_queue<std::pair<uint64_t, std::string>, + std::vector<std::pair<uint64_t, std::string>>, + std::greater<std::pair<uint64_t, std::string>>> + top_k_prefix_access; + std::priority_queue<std::pair<double, std::string>, + std::vector<std::pair<double, std::string>>, + std::greater<std::pair<double, std::string>>> + top_k_prefix_ave; + std::priority_queue<std::pair<uint32_t, uint32_t>, + std::vector<std::pair<uint32_t, uint32_t>>, + std::greater<std::pair<uint32_t, uint32_t>>> + top_k_qps_sec; + std::list<TraceUnit> time_series; + std::vector<std::pair<uint64_t, uint64_t>> correlation_output; + std::map<uint32_t, uint64_t> uni_key_num; + + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> time_series_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_count_dist_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_prefix_cut_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_value_size_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_size_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_num_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_qps_f; + 
std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_top_qps_prefix_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_key_f; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_prefix_cut_f; + + TraceStats(); + ~TraceStats(); + TraceStats(const TraceStats&) = delete; + TraceStats& operator=(const TraceStats&) = delete; + TraceStats(TraceStats&&) = default; + TraceStats& operator=(TraceStats&&) = default; +}; + +struct TypeUnit { + std::string type_name; + bool enabled; + uint64_t total_keys; + uint64_t total_access; + uint64_t total_succ_access; + uint32_t sample_count; + std::map<uint32_t, TraceStats> stats; + TypeUnit() = default; + ~TypeUnit() = default; + TypeUnit(const TypeUnit&) = delete; + TypeUnit& operator=(const TypeUnit&) = delete; + TypeUnit(TypeUnit&&) = default; + TypeUnit& operator=(TypeUnit&&) = default; +}; + +struct CfUnit { + uint32_t cf_id; + uint64_t w_count; // total keys in this cf if we use the whole key space + uint64_t a_count; // the total keys in this cf that are accessed + std::map<uint64_t, uint64_t> w_key_size_stats; // whole key space key size + // statistic this cf + std::map<uint32_t, uint32_t> cf_qps; +}; + +class TraceAnalyzer : private TraceRecord::Handler, + private WriteBatch::Handler { + public: + TraceAnalyzer(std::string& trace_path, std::string& output_path, + AnalyzerOptions _analyzer_opts); + ~TraceAnalyzer(); + + Status PrepareProcessing(); + + Status StartProcessing(); + + Status MakeStatistics(); + + Status ReProcessing(); + + Status EndProcessing(); + + Status WriteTraceUnit(TraceUnit& unit); + + std::vector<TypeUnit>& GetTaVector() { return ta_; } + + private: + using TraceRecord::Handler::Handle; + Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* result) override; + Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* result) override; + Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr<TraceRecordResult>* 
                result) override;
  Status Handle(const MultiGetQueryTraceRecord& record,
                std::unique_ptr<TraceRecordResult>* result) override;

  // WriteBatch::Handler overrides: each write-batch entry type is forwarded
  // to OutputAnalysisResult with the enclosing batch's timestamp.
  using WriteBatch::Handler::PutCF;
  Status PutCF(uint32_t column_family_id, const Slice& key,
               const Slice& value) override;

  using WriteBatch::Handler::DeleteCF;
  Status DeleteCF(uint32_t column_family_id, const Slice& key) override;

  using WriteBatch::Handler::SingleDeleteCF;
  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override;

  using WriteBatch::Handler::DeleteRangeCF;
  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
                       const Slice& end_key) override;

  using WriteBatch::Handler::MergeCF;
  Status MergeCF(uint32_t column_family_id, const Slice& key,
                 const Slice& value) override;

  // The following handlers are not implemented; they return Status::OK() to
  // avoid the running time assertion and other irrelevant failures.
  using WriteBatch::Handler::PutBlobIndexCF;
  Status PutBlobIndexCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
                        const Slice& /*value*/) override {
    return Status::OK();
  }

  // The default implementation of LogData does nothing.
+ using WriteBatch::Handler::LogData; + void LogData(const Slice& /*blob*/) override {} + + using WriteBatch::Handler::MarkBeginPrepare; + Status MarkBeginPrepare(bool = false) override { return Status::OK(); } + + using WriteBatch::Handler::MarkEndPrepare; + Status MarkEndPrepare(const Slice& /*xid*/) override { return Status::OK(); } + + using WriteBatch::Handler::MarkNoop; + Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); } + + using WriteBatch::Handler::MarkRollback; + Status MarkRollback(const Slice& /*xid*/) override { return Status::OK(); } + + using WriteBatch::Handler::MarkCommit; + Status MarkCommit(const Slice& /*xid*/) override { return Status::OK(); } + + using WriteBatch::Handler::MarkCommitWithTimestamp; + Status MarkCommitWithTimestamp(const Slice& /*xid*/, + const Slice& /*commit_ts*/) override { + return Status::OK(); + } + + // Process each trace operation and output the analysis result to + // stdout/files. + Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp, + std::vector<uint32_t> cf_ids, + std::vector<Slice> keys, + std::vector<size_t> value_sizes); + + Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp, + uint32_t cf_id, const Slice& key, + size_t value_size); + + ROCKSDB_NAMESPACE::Env* env_; + EnvOptions env_options_; + std::unique_ptr<TraceReader> trace_reader_; + size_t offset_; + char buffer_[1024]; + // Timestamp of a WriteBatch, used in its iteration. 
+ uint64_t write_batch_ts_; + std::string trace_name_; + std::string output_path_; + AnalyzerOptions analyzer_opts_; + uint64_t total_requests_; + uint64_t total_access_keys_; + uint64_t total_gets_; + uint64_t total_writes_; + uint64_t total_seeks_; + uint64_t total_seek_prevs_; + uint64_t total_multigets_; + uint64_t trace_create_time_; + uint64_t begin_time_; + uint64_t end_time_; + uint64_t time_series_start_; + uint32_t sample_max_; + uint32_t cur_time_sec_; + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> + trace_sequence_f_; // readable trace + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> qps_f_; // overall qps + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> + cf_qps_f_; // The qps of each CF> + std::vector<TypeUnit> ta_; // The main statistic collecting data structure + std::map<uint32_t, CfUnit> cfs_; // All the cf_id appears in this trace; + std::vector<uint32_t> qps_peak_; + std::vector<double> qps_ave_; + + Status ReadTraceHeader(Trace* header); + Status ReadTraceFooter(Trace* footer); + Status ReadTraceRecord(Trace* trace); + Status KeyStatsInsertion(const uint32_t& type, const uint32_t& cf_id, + const std::string& key, const size_t value_size, + const uint64_t ts); + Status StatsUnitCorrelationUpdate(StatsUnit& unit, const uint32_t& type, + const uint64_t& ts, const std::string& key); + Status OpenStatsOutputFiles(const std::string& type, TraceStats& new_stats); + Status CreateOutputFile( + const std::string& type, const std::string& cf_name, + const std::string& ending, + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr); + Status CloseOutputFiles(); + + void PrintStatistics(); + Status TraceUnitWriter( + std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>& f_ptr, TraceUnit& unit); + Status WriteTraceSequence(const uint32_t& type, const uint32_t& cf_id, + const Slice& key, const size_t value_size, + const uint64_t ts); + Status MakeStatisticKeyStatsOrPrefix(TraceStats& stats); + Status MakeStatisticCorrelation(TraceStats& stats, 
StatsUnit& unit); + Status MakeStatisticQPS(); + int db_version_; +}; + +int trace_analyzer_tool(int argc, char** argv); + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/tools/verify_random_db.sh b/src/rocksdb/tools/verify_random_db.sh new file mode 100755 index 000000000..fbe5b75fd --- /dev/null +++ b/src/rocksdb/tools/verify_random_db.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to verify DB generated by generate_random_db.sh cannot opened and read correct data. +# ./ldb needs to be avaible to be executed. +# +# Usage: <SCRIPT> <DB Path> + +scriptpath=`dirname $BASH_SOURCE` +if [ "$#" -lt 2 ]; then + echo "usage: $BASH_SOURCE <db_directory> <compare_base_db_directory> [dump_file_name] [if_try_load_options] [if_ignore_unknown_options]" + exit 1 +fi + +db_dir=$1 +base_db_dir=$2 +dump_file_name=${3:-"dump_file.txt"} +try_load_options=${4:-"1"} +ignore_unknown_options=${5:-"0"} +db_dump=$db_dir"/"$dump_file_name +base_db_dump=$base_db_dir"/"$dump_file_name +extra_params= + +if [ "$try_load_options" = "0" ]; then + extra_params=" --try_load_options=false" +elif [ "$try_load_options" = "1" ]; then + extra_params=" --try_load_options=true" +fi + +if [ "$ignore_unknown_options" = "1" ]; then + extra_params="$extra_params --ignore_unknown_options" +fi + +set -e +echo == Dumping data from $db_dir to $db_dump +./ldb dump --db=$db_dir $extra_params > $db_dump + +echo == Dumping data from $base_db_dir to $base_db_dump +./ldb dump --db=$base_db_dir $extra_params > $base_db_dump + +diff $db_dump $base_db_dump diff --git a/src/rocksdb/tools/write_external_sst.sh b/src/rocksdb/tools/write_external_sst.sh new file mode 100755 index 000000000..be01ae022 --- /dev/null +++ b/src/rocksdb/tools/write_external_sst.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# +# + +if [ "$#" -lt 3 ]; then + echo "usage: $BASH_SOURCE <input_data_path> <DB Path> <extern SST dir>" + exit 1 +fi + +input_data_dir=$1 +db_dir=$2 +extern_sst_dir=$3 +rm -rf $db_dir +mkdir -p $extern_sst_dir + +set -e + +n=0 + +for f in `find $input_data_dir -name sorted_data*` +do + echo == Writing external SST file $f to $extern_sst_dir/extern_sst${n} + ./ldb --db=$db_dir --create_if_missing write_extern_sst $extern_sst_dir/extern_sst${n} < $f + let "n = n + 1" +done diff --git a/src/rocksdb/tools/write_stress.cc b/src/rocksdb/tools/write_stress.cc new file mode 100644 index 000000000..ba5bd3f4f --- /dev/null +++ b/src/rocksdb/tools/write_stress.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// +// The goal of this tool is to be a simple stress test with focus on catching: +// * bugs in compaction/flush processes, especially the ones that cause +// assertion errors +// * bugs in the code that deletes obsolete files +// +// There are two parts of the test: +// * write_stress, a binary that writes to the database +// * write_stress_runner.py, a script that invokes and kills write_stress +// +// Here are some interesting parts of write_stress: +// * Runs with very high concurrency of compactions and flushes (32 threads +// total) and tries to create a huge amount of small files +// * The keys written to the database are not uniformly distributed -- there is +// a 3-character prefix that mutates occasionally (in prefix mutator thread), in +// such a way that the first character mutates slower than second, which mutates +// slower than third character. 
That way, the compaction stress tests some +// interesting compaction features like trivial moves and bottommost level +// calculation +// * There is a thread that creates an iterator, holds it for couple of seconds +// and then iterates over all keys. This is supposed to test RocksDB's abilities +// to keep the files alive when there are references to them. +// * Some writes trigger WAL sync. This is stress testing our WAL sync code. +// * At the end of the run, we make sure that we didn't leak any of the sst +// files +// +// write_stress_runner.py changes the mode in which we run write_stress and also +// kills and restarts it. There are some interesting characteristics: +// * At the beginning we divide the full test runtime into smaller parts -- +// shorter runtimes (couple of seconds) and longer runtimes (100, 1000) seconds +// * The first time we run write_stress, we destroy the old DB. Every next time +// during the test, we use the same DB. +// * We can run in kill mode or clean-restart mode. Kill mode kills the +// write_stress violently. +// * We can run in mode where delete_obsolete_files_with_fullscan is true or +// false +// * We can run with low_open_files mode turned on or off. When it's turned on, +// we configure table cache to only hold a couple of files -- that way we need +// to reopen files every time we access them. +// +// Another goal was to create a stress test without a lot of parameters. So +// tools/write_stress_runner.py should only take one parameter -- runtime_sec +// and it should figure out everything else on its own. 
+ +#include <cstdio> + +#ifndef GFLAGS +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include <atomic> +#include <cinttypes> +#include <random> +#include <set> +#include <string> +#include <thread> + +#include "file/filename.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::RegisterFlagValidator; +using GFLAGS_NAMESPACE::SetUsageMessage; + +DEFINE_int32(key_size, 10, "Key size"); +DEFINE_int32(value_size, 100, "Value size"); +DEFINE_string(db, "", "Use the db with the following name."); +DEFINE_bool(destroy_db, true, + "Destroy the existing DB before running the test"); + +DEFINE_int32(runtime_sec, 10 * 60, "How long are we running for, in seconds"); +DEFINE_int32(seed, 139, "Random seed"); + +DEFINE_double(prefix_mutate_period_sec, 1.0, + "How often are we going to mutate the prefix"); +DEFINE_double(first_char_mutate_probability, 0.1, + "How likely are we to mutate the first char every period"); +DEFINE_double(second_char_mutate_probability, 0.2, + "How likely are we to mutate the second char every period"); +DEFINE_double(third_char_mutate_probability, 0.5, + "How likely are we to mutate the third char every period"); + +DEFINE_int32(iterator_hold_sec, 5, + "How long will the iterator hold files before it gets destroyed"); + +DEFINE_double(sync_probability, 0.01, "How often are we syncing writes"); +DEFINE_bool(delete_obsolete_files_with_fullscan, false, + "If true, we delete obsolete files after each compaction/flush " + "using GetChildren() API"); +DEFINE_bool(low_open_files_mode, false, + "If true, we set max_open_files to 20, so that every file access " + "needs to reopen it"); + +namespace ROCKSDB_NAMESPACE { + +static const int kPrefixSize = 3; + +class WriteStress { + 
public: + WriteStress() : stop_(false) { + // initialize key_prefix + for (int i = 0; i < kPrefixSize; ++i) { + key_prefix_[i].store('a'); + } + + // Choose a location for the test database if none given with --db=<path> + if (FLAGS_db.empty()) { + std::string default_db_path; + Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/write_stress"; + FLAGS_db = default_db_path; + } + + Options options; + if (FLAGS_destroy_db) { + DestroyDB(FLAGS_db, options); // ignore + } + + // make the LSM tree deep, so that we have many concurrent flushes and + // compactions + options.create_if_missing = true; + options.write_buffer_size = 256 * 1024; // 256k + options.max_bytes_for_level_base = 1 * 1024 * 1024; // 1MB + options.target_file_size_base = 100 * 1024; // 100k + options.max_write_buffer_number = 16; + options.max_background_compactions = 16; + options.max_background_flushes = 16; + options.max_open_files = FLAGS_low_open_files_mode ? 20 : -1; + if (FLAGS_delete_obsolete_files_with_fullscan) { + options.delete_obsolete_files_period_micros = 0; + } + + // open DB + DB* db; + Status s = DB::Open(options, FLAGS_db, &db); + if (!s.ok()) { + fprintf(stderr, "Can't open database: %s\n", s.ToString().c_str()); + std::abort(); + } + db_.reset(db); + } + + void WriteThread() { + std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed)); + std::uniform_real_distribution<double> dist(0, 1); + + auto random_string = [](std::mt19937& r, int len) { + std::uniform_int_distribution<int> char_dist('a', 'z'); + std::string ret; + for (int i = 0; i < len; ++i) { + ret += static_cast<char>(char_dist(r)); + } + return ret; + }; + + while (!stop_.load(std::memory_order_relaxed)) { + std::string prefix; + prefix.resize(kPrefixSize); + for (int i = 0; i < kPrefixSize; ++i) { + prefix[i] = key_prefix_[i].load(std::memory_order_relaxed); + } + auto key = prefix + random_string(rng, FLAGS_key_size - kPrefixSize); + auto value = random_string(rng, FLAGS_value_size); + 
WriteOptions woptions; + woptions.sync = dist(rng) < FLAGS_sync_probability; + auto s = db_->Put(woptions, key, value); + if (!s.ok()) { + fprintf(stderr, "Write to DB failed: %s\n", s.ToString().c_str()); + std::abort(); + } + } + } + + void IteratorHoldThread() { + while (!stop_.load(std::memory_order_relaxed)) { + std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions())); + SystemClock::Default()->SleepForMicroseconds(FLAGS_iterator_hold_sec * + 1000 * 1000LL); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + } + if (!iterator->status().ok()) { + fprintf(stderr, "Iterator statuts not OK: %s\n", + iterator->status().ToString().c_str()); + std::abort(); + } + } + } + + void PrefixMutatorThread() { + std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed)); + std::uniform_real_distribution<double> dist(0, 1); + std::uniform_int_distribution<int> char_dist('a', 'z'); + while (!stop_.load(std::memory_order_relaxed)) { + SystemClock::Default()->SleepForMicroseconds( + static_cast<int>(FLAGS_prefix_mutate_period_sec * 1000 * 1000LL)); + if (dist(rng) < FLAGS_first_char_mutate_probability) { + key_prefix_[0].store(static_cast<char>(char_dist(rng)), + std::memory_order_relaxed); + } + if (dist(rng) < FLAGS_second_char_mutate_probability) { + key_prefix_[1].store(static_cast<char>(char_dist(rng)), + std::memory_order_relaxed); + } + if (dist(rng) < FLAGS_third_char_mutate_probability) { + key_prefix_[2].store(static_cast<char>(char_dist(rng)), + std::memory_order_relaxed); + } + } + } + + int Run() { + threads_.emplace_back([&]() { WriteThread(); }); + threads_.emplace_back([&]() { PrefixMutatorThread(); }); + threads_.emplace_back([&]() { IteratorHoldThread(); }); + + if (FLAGS_runtime_sec == -1) { + // infinite runtime, until we get killed + while (true) { + SystemClock::Default()->SleepForMicroseconds(1000 * 1000); + } + } + + SystemClock::Default()->SleepForMicroseconds(FLAGS_runtime_sec * 1000 * + 1000); + + stop_.store(true, 
std::memory_order_relaxed); + for (auto& t : threads_) { + t.join(); + } + threads_.clear(); + +// Skip checking for leaked files in ROCKSDB_LITE since we don't have access to +// function GetLiveFilesMetaData +#ifndef ROCKSDB_LITE + // let's see if we leaked some files + db_->PauseBackgroundWork(); + std::vector<LiveFileMetaData> metadata; + db_->GetLiveFilesMetaData(&metadata); + std::set<uint64_t> sst_file_numbers; + for (const auto& file : metadata) { + uint64_t number; + FileType type; + if (!ParseFileName(file.name, &number, "LOG", &type)) { + continue; + } + if (type == kTableFile) { + sst_file_numbers.insert(number); + } + } + + std::vector<std::string> children; + Env::Default()->GetChildren(FLAGS_db, &children); + for (const auto& child : children) { + uint64_t number; + FileType type; + if (!ParseFileName(child, &number, "LOG", &type)) { + continue; + } + if (type == kTableFile) { + if (sst_file_numbers.find(number) == sst_file_numbers.end()) { + fprintf(stderr, + "Found a table file in DB path that should have been " + "deleted: %s\n", + child.c_str()); + std::abort(); + } + } + } + db_->ContinueBackgroundWork(); +#endif // !ROCKSDB_LITE + + return 0; + } + + private: + // each key is prepended with this prefix. we occasionally change it. third + // letter is changed more frequently than second, which is changed more + // frequently than the first one. 
  std::atomic<char> key_prefix_[kPrefixSize];  // mutated by PrefixMutatorThread
  std::atomic<bool> stop_;                     // signals all worker threads to exit
  std::vector<port::Thread> threads_;
  std::unique_ptr<DB> db_;
};

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                  " [OPTIONS]...");
  ParseCommandLineFlags(&argc, &argv, true);
  ROCKSDB_NAMESPACE::WriteStress write_stress;
  return write_stress.Run();
}

#endif  // GFLAGS
diff --git a/src/rocksdb/tools/write_stress_runner.py b/src/rocksdb/tools/write_stress_runner.py
new file mode 100644
index 000000000..f39f79cd4
--- /dev/null
+++ b/src/rocksdb/tools/write_stress_runner.py
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import random

import subprocess
import sys
import time


def generate_runtimes(total_runtime):
    """Split *total_runtime* seconds into a random list of per-run durations.

    The candidate durations combine short runtimes and long runtimes, with
    heavier weight on short runtimes (1-9 appears twice via the two ranges).
    The returned durations sum exactly to total_runtime because the last
    choice is clamped to the remaining budget.
    """
    possible_runtimes_sec = list(range(1, 10)) + list(range(1, 20)) + [100, 1000]
    runtimes = []
    while total_runtime > 0:
        chosen = random.choice(possible_runtimes_sec)
        chosen = min(chosen, total_runtime)
        runtimes.append(chosen)
        total_runtime -= chosen
    return runtimes


def main(args):
    """Repeatedly run ./write_stress, randomly choosing kill vs clean mode.

    The first run may destroy the DB; every later run reuses it
    (--destroy_db=false). In kill mode the child is started with
    --runtime_sec=-1 (run forever) and killed after the scheduled duration.
    """
    runtimes = generate_runtimes(int(args.runtime_sec))
    print(
        "Going to execute write stress for " + str(runtimes)
    )  # noqa: E999 T25377293 Grandfathered in
    first_time = True

    for runtime in runtimes:
        kill = random.choice([False, True])

        cmd = "./write_stress --runtime_sec=" + ("-1" if kill else str(runtime))

        if len(args.db) > 0:
            cmd = cmd + " --db=" + args.db

        if first_time:
            first_time = False
        else:
            # use current db
            cmd = cmd + " --destroy_db=false"
        # Randomly toggle the obsolete-file fullscan and low-open-files modes.
        if random.choice([False, True]):
            cmd = cmd + " --delete_obsolete_files_with_fullscan=true"
        if random.choice([False, True]):
            cmd = cmd + " --low_open_files_mode=true"

        print(
            "Running write_stress for %d seconds (%s): %s"
            % (runtime, ("kill-mode" if kill else "clean-shutdown-mode"), cmd)
        )

        # NOTE(review): shell=True with a string-built command; inputs come
        # from trusted flags here, but keep it that way.
        child = subprocess.Popen([cmd], shell=True)
        killtime = time.time() + runtime
        # In clean mode (kill == False) wait for the child to exit on its own;
        # in kill mode poll until the scheduled kill time.
        while not kill or time.time() < killtime:
            time.sleep(1)
            if child.poll() is not None:
                if child.returncode == 0:
                    break
                else:
                    print(
                        "ERROR: write_stress died with exitcode=%d\n" % child.returncode
                    )
                    sys.exit(1)
        if kill:
            child.kill()
        # breathe
        time.sleep(3)


if __name__ == "__main__":
    random.seed(time.time())
    parser = argparse.ArgumentParser(
        description="This script runs and kills \
        write_stress multiple times"
    )
    parser.add_argument("--runtime_sec", default="1000")
    parser.add_argument("--db", default="")
    args = parser.parse_args()
    main(args)